Spaces:

findConsole
/

PromptTesting

Running

App Files Community

rawanessam committed 17 days ago

Commit

dc65367

verified ·

1 Parent(s): 0bd12fb

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -27

app.py CHANGED Viewed

@@ -3,10 +3,26 @@ import os
 import json
 import requests
 from io import BytesIO
 import pandas as pd
 import fitz  # PyMuPDF
 from urllib.parse import urlparse, unquote
 import re
 import logging
 # Set up logging to see everything
@@ -95,9 +111,9 @@ def openPDF(pdf_path):
 def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
     """Ask an LLM (OpenRouter) to identify headers in the document.
-    Returns a list of dicts: {text, page, suggested_level, confidence, body}.
     The function sends plain page-line strings to the LLM (including page numbers)
-    and asks for a JSON array containing headers with suggested levels and body for the last header.
     """
     logger.info("=" * 80)
     logger.info("STARTING IDENTIFY_HEADERS_WITH_OPENROUTER")
@@ -141,10 +157,17 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
                 y1 = spans[0]['bbox'][3]
                 # if y0 < top_margin or y1 > (page_height - bottom_margin):
                 #     continue
-                text = " ".join(s.get('text','') for s in spans).strip()
-                if text:
                     # prefix with page for easier mapping back
-                    lines_for_prompt.append(f"PAGE {pno+1}: {text}")
                     lines_on_page += 1
         if lines_on_page > 0:
@@ -165,6 +188,7 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
     prompt = LLM_prompt + "\n\nLines:\n" + "\n".join(lines_for_prompt)
     logger.debug(f"Full prompt length: {len(prompt)} characters")
     print("=" * 80)
     print("FULL LLM PROMPT:")
     print(prompt)
@@ -179,11 +203,13 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
         logger.error(f"Could not save prompt to file: {e}")
     if not api_key:
         logger.error("No API key provided")
         return []
     url = "https://openrouter.ai/api/v1/chat/completions"
     headers = {
         "Authorization": f"Bearer {api_key}",
         "Content-Type": "application/json",
@@ -191,9 +217,11 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
         "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
     }
     logger.info(f"Making request to OpenRouter with model: {model}")
     logger.debug(f"Headers (API key masked): { {k: '***' if k == 'Authorization' else v for k, v in headers.items()} }")
     body = {
         "model": model,
         "messages": [
@@ -206,9 +234,12 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
         ]
     }
     try:
         logger.debug(f"Request body (without prompt text): { {k: v if k != 'messages' else '[...prompt...]' for k, v in body.items()} }")
         resp = requests.post(
             url=url,
             headers=headers,
@@ -219,6 +250,7 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
         resp.raise_for_status()
         resp_text = resp.text
         print("=" * 80)
         print("FULL LLM RESPONSE:")
         print(resp_text)
@@ -226,6 +258,7 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
         logger.info(f"LLM raw response length: {len(resp_text)}")
         try:
             with open("llm_debug.json", "w", encoding="utf-8") as fh:
                 fh.write(resp_text)
@@ -284,12 +317,14 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
     if not text_reply:
         logger.error("Could not extract text reply from response")
         print("=" * 80)
         print("FAILED TO EXTRACT TEXT REPLY. FULL RESPONSE STRUCTURE:")
         print(json.dumps(rj, indent=2))
         print("=" * 80)
         return []
     print("=" * 80)
     print("EXTRACTED TEXT REPLY:")
     print(text_reply)
@@ -312,7 +347,9 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
     except json.JSONDecodeError as e:
         logger.error(f"Failed to parse JSON: {e}")
         logger.error(f"JSON string that failed to parse: {js[:1000]}")
         try:
             import re
             json_pattern = r'\[\s*\{.*?\}\s*\]'
             matches = re.findall(json_pattern, text_reply, re.DOTALL)
@@ -338,7 +375,7 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
     # Log parsed results
     logger.info(f"Parsed {len(parsed)} header items:")
-    for i, obj in enumerate(parsed[:10]):
         logger.info(f"  Item {i}: {obj}")
     # Normalize parsed entries and return
@@ -348,24 +385,10 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
         page = int(obj.get('page')) if obj.get('page') else None
         level = obj.get('suggested_level')
         conf = float(obj.get('confidence') or 0)
-        body = obj.get('body', '')  # Get body content, default to empty string
         if t and page is not None:
-            out.append({
-                'text': t,
-                'page': page-1,
-                'suggested_level': level,
-                'confidence': conf,
-                'body': body  # Add body to output
-            })
-    logger.info(f"Returning {len(out)} valid header entries with body content for last header")
-    # Log which entries have body content
-    for i, item in enumerate(out):
-        if item.get('body'):
-            logger.info(f"Entry {i} has body content (length: {len(item['body'])})")
     return out
@@ -390,17 +413,13 @@ def identify_headers_and_save_excel(pdf_path, model, llm_prompt):
     logger.info("DataFrame head:")
     logger.info(df.head().to_string())
-    # Check which rows have body content
-    bodies = df['body'].tolist()
-    non_empty_bodies = [b for b in bodies if b and str(b).strip()]
-    logger.info(f"Found {len(non_empty_bodies)} entries with body content")
     # Save Excel to a file on disk
     output_path = "output.xlsx"
     try:
         df.to_excel(output_path, index=False, engine='openpyxl')
         logger.info(f"Excel file saved successfully to: {output_path}")
         if os.path.exists(output_path):
             file_size = os.path.getsize(output_path)
             logger.info(f"Output file exists, size: {file_size} bytes")
@@ -421,6 +440,7 @@ iface = gr.Interface(
         gr.Textbox(label="LLM Prompt")
     ],
     outputs = gr.File(file_count="single", label="Download Excel")
 )
 if __name__ == "__main__":

 import json
 import requests
 from io import BytesIO
+import gradio as gr
 import pandas as pd
+from io import BytesIO
 import fitz  # PyMuPDF
 from urllib.parse import urlparse, unquote
+import os
+from io import BytesIO
+import re
+import requests
+import pandas as pd
+import fitz  # PyMuPDF
 import re
+import urllib.parse
+import difflib
+from fuzzywuzzy import fuzz
+import copy
+# import tsadropboxretrieval
+import urllib.parse
 import logging
 # Set up logging to see everything
 def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
     """Ask an LLM (OpenRouter) to identify headers in the document.
+    Returns a list of dicts: {text, page, suggested_level, confidence}.
     The function sends plain page-line strings to the LLM (including page numbers)
+    and asks for a JSON array containing only header lines with suggested levels.
     """
     logger.info("=" * 80)
     logger.info("STARTING IDENTIFY_HEADERS_WITH_OPENROUTER")
                 y1 = spans[0]['bbox'][3]
                 # if y0 < top_margin or y1 > (page_height - bottom_margin):
                 #     continue
+                for s in spans:
+                    # text,font,size,flags,color
+                    ArrayofTextWithFormat={s.get('text')}
                     # prefix with page for easier mapping back
+                    lines_for_prompt.append(f"PAGE {pno+1}: {ArrayofTextWithFormat}")
+                # text = " ".join(s.get('text','') for s in spans).strip()
+                # if text:
+                #     # prefix with page for easier mapping back
+                #     lines_for_prompt.append(f"PAGE {pno+1}: {text}")
                     lines_on_page += 1
         if lines_on_page > 0:
     prompt = LLM_prompt + "\n\nLines:\n" + "\n".join(lines_for_prompt)
     logger.debug(f"Full prompt length: {len(prompt)} characters")
+    # Changed: Print entire prompt, not truncated
     print("=" * 80)
     print("FULL LLM PROMPT:")
     print(prompt)
         logger.error(f"Could not save prompt to file: {e}")
     if not api_key:
+        # No API key: return empty so caller can fallback to heuristics
         logger.error("No API key provided")
         return []
     url = "https://openrouter.ai/api/v1/chat/completions"
+    # Build headers following the OpenRouter example
     headers = {
         "Authorization": f"Bearer {api_key}",
         "Content-Type": "application/json",
         "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
     }
+    # Log request details (without exposing full API key)
     logger.info(f"Making request to OpenRouter with model: {model}")
     logger.debug(f"Headers (API key masked): { {k: '***' if k == 'Authorization' else v for k, v in headers.items()} }")
+    # Wrap the prompt as the example 'content' array expected by OpenRouter
     body = {
         "model": model,
         "messages": [
         ]
     }
+    # Debug: log request body (truncated) and write raw response for inspection
     try:
+        # Changed: Log full body (excluding prompt text which is already logged)
         logger.debug(f"Request body (without prompt text): { {k: v if k != 'messages' else '[...prompt...]' for k, v in body.items()} }")
+        # Removed timeout parameter
         resp = requests.post(
             url=url,
             headers=headers,
         resp.raise_for_status()
         resp_text = resp.text
+        # Changed: Print entire response
         print("=" * 80)
         print("FULL LLM RESPONSE:")
         print(resp_text)
         logger.info(f"LLM raw response length: {len(resp_text)}")
+        # Save raw response for offline inspection
         try:
             with open("llm_debug.json", "w", encoding="utf-8") as fh:
                 fh.write(resp_text)
     if not text_reply:
         logger.error("Could not extract text reply from response")
+        # Changed: Print the entire response structure for debugging
         print("=" * 80)
         print("FAILED TO EXTRACT TEXT REPLY. FULL RESPONSE STRUCTURE:")
         print(json.dumps(rj, indent=2))
         print("=" * 80)
         return []
+    # Changed: Print the extracted text reply
     print("=" * 80)
     print("EXTRACTED TEXT REPLY:")
     print(text_reply)
     except json.JSONDecodeError as e:
         logger.error(f"Failed to parse JSON: {e}")
         logger.error(f"JSON string that failed to parse: {js[:1000]}")
+        # Try to find any JSON-like structure
         try:
+            # Try to extract any JSON array
             import re
             json_pattern = r'\[\s*\{.*?\}\s*\]'
             matches = re.findall(json_pattern, text_reply, re.DOTALL)
     # Log parsed results
     logger.info(f"Parsed {len(parsed)} header items:")
+    for i, obj in enumerate(parsed[:10]):  # Log first 10 items
         logger.info(f"  Item {i}: {obj}")
     # Normalize parsed entries and return
         page = int(obj.get('page')) if obj.get('page') else None
         level = obj.get('suggested_level')
         conf = float(obj.get('confidence') or 0)
         if t and page is not None:
+            out.append({'text': t, 'page': page-1, 'suggested_level': level, 'confidence': conf})
+    logger.info(f"Returning {len(out)} valid header entries")
     return out
     logger.info("DataFrame head:")
     logger.info(df.head().to_string())
     # Save Excel to a file on disk
     output_path = "output.xlsx"
     try:
         df.to_excel(output_path, index=False, engine='openpyxl')
         logger.info(f"Excel file saved successfully to: {output_path}")
+        # Verify file was created
         if os.path.exists(output_path):
             file_size = os.path.getsize(output_path)
             logger.info(f"Output file exists, size: {file_size} bytes")
         gr.Textbox(label="LLM Prompt")
     ],
     outputs = gr.File(file_count="single", label="Download Excel")
 )
 if __name__ == "__main__":