Spaces:

findConsole
/

PromptTesting

Sleeping

App Files Files Community

Marthee committed 18 days ago

Commit

a966ccd

verified ·

1 Parent(s): 7c121c1

Update app.py

Browse files

Files changed (1) hide show

app.py +168 -18

app.py CHANGED Viewed

@@ -405,14 +405,178 @@ def identify_headers_and_save_excel(pdf_path, model, llm_prompt):
         return None
     logger.info(f"Got {len(result)} results, creating DataFrame")
-    df = pd.DataFrame(result)
-    # Log DataFrame info
-    logger.info(f"DataFrame shape: {df.shape}")
-    logger.info(f"DataFrame columns: {df.columns.tolist()}")
     logger.info("DataFrame head:")
     logger.info(df.head().to_string())
     # Save Excel to a file on disk
     output_path = "output.xlsx"
     try:
@@ -423,20 +587,6 @@ def identify_headers_and_save_excel(pdf_path, model, llm_prompt):
         if os.path.exists(output_path):
             file_size = os.path.getsize(output_path)
             logger.info(f"Output file exists, size: {file_size} bytes")
-        else:
-            logger.error(f"Output file was not created at: {output_path}")
-    except Exception as e:
-        logger.error(f"Failed to save Excel file: {e}")
-        return None
-    return output_path  # return file path, not BytesIO
-iface = gr.Interface(
-    fn=identify_headers_and_save_excel,
-    inputs=[
-        gr.Textbox(label="Document Link"),
-        gr.Textbox(label="Model Type"),
         gr.Textbox(label="LLM Prompt")
     ],
     outputs = gr.File(file_count="single", label="Download Excel")

         return None
     logger.info(f"Got {len(result)} results, creating DataFrame")
+import json
+import requests
+from io import BytesIO
+import gradio as gr
+import pandas as pd
+from io import BytesIO
+import fitz  # PyMuPDF
+from urllib.parse import urlparse, unquote
+import os
+from io import BytesIO
+import re
+import requests
+import pandas as pd
+import fitz  # PyMuPDF
+import re
+import urllib.parse
+import difflib
+from fuzzywuzzy import fuzz
+import copy
+# import tsadropboxretrieval
+import urllib.parse
+import logging
+# Set up logging to see everything
+def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
+    """Ask an LLM (OpenRouter) to identify headers in the document.
+    Returns a list of dicts: {text, page, suggested_level, confidence}.
+    The function sends plain page-line strings to the LLM (including page numbers)
+    and asks for a JSON array containing only header lines with suggested levels.
+    """
+    logger.info("=" * 80)
+    logger.info("STARTING IDENTIFY_HEADERS_WITH_OPENROUTER")
+                y1 = spans[0]['bbox'][3]
+                # if y0 < top_margin or y1 > (page_height - bottom_margin):
+                #     continue
+                for s in spans:
+                    # text,font,size,flags,color
+                    ArrayofTextWithFormat={s.get('text')}
+                    # prefix with page for easier mapping back
+                    lines_for_prompt.append(f"PAGE {pno+1}: {ArrayofTextWithFormat}")
+                # text = " ".join(s.get('text','') for s in spans).strip()
+                # if text:
+                #     # prefix with page for easier mapping back
+                #     lines_for_prompt.append(f"PAGE {pno+1}: {text}")
+                    lines_on_page += 1
+        if lines_on_page > 0:
+    prompt = LLM_prompt + "\n\nLines:\n" + "\n".join(lines_for_prompt)
+    logger.debug(f"Full prompt length: {len(prompt)} characters")
+    # Changed: Print entire prompt, not truncated
+    print("=" * 80)
+    print("FULL LLM PROMPT:")
+    print(prompt)
+        logger.error(f"Could not save prompt to file: {e}")
+    if not api_key:
+        # No API key: return empty so caller can fallback to heuristics
+        logger.error("No API key provided")
+        return []
+    url = "https://openrouter.ai/api/v1/chat/completions"
+    # Build headers following the OpenRouter example
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+        "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
+    }
+    # Log request details (without exposing full API key)
+    logger.info(f"Making request to OpenRouter with model: {model}")
+    logger.debug(f"Headers (API key masked): { {k: '***' if k == 'Authorization' else v for k, v in headers.items()} }")
+    # Wrap the prompt as the example 'content' array expected by OpenRouter
+    body = {
+        "model": model,
+        "messages": [
+        ]
+    }
+    # Debug: log request body (truncated) and write raw response for inspection
+    try:
+        # Changed: Log full body (excluding prompt text which is already logged)
+        logger.debug(f"Request body (without prompt text): { {k: v if k != 'messages' else '[...prompt...]' for k, v in body.items()} }")
+        # Removed timeout parameter
+        resp = requests.post(
+            url=url,
+            headers=headers,
+        resp.raise_for_status()
+        resp_text = resp.text
+        # Changed: Print entire response
+        print("=" * 80)
+        print("FULL LLM RESPONSE:")
+        print(resp_text)
+        logger.info(f"LLM raw response length: {len(resp_text)}")
+        # Save raw response for offline inspection
+        try:
+            with open("llm_debug.json", "w", encoding="utf-8") as fh:
+                fh.write(resp_text)
+    if not text_reply:
+        logger.error("Could not extract text reply from response")
+        # Changed: Print the entire response structure for debugging
+        print("=" * 80)
+        print("FAILED TO EXTRACT TEXT REPLY. FULL RESPONSE STRUCTURE:")
+        print(json.dumps(rj, indent=2))
+        print("=" * 80)
+        return []
+    # Changed: Print the extracted text reply
+    print("=" * 80)
+    print("EXTRACTED TEXT REPLY:")
+    print(text_reply)
+    except json.JSONDecodeError as e:
+        logger.error(f"Failed to parse JSON: {e}")
+        logger.error(f"JSON string that failed to parse: {js[:1000]}")
+        # Try to find any JSON-like structure
+        try:
+            # Try to extract any JSON array
+            import re
+            json_pattern = r'\[\s*\{.*?\}\s*\]'
+            matches = re.findall(json_pattern, text_reply, re.DOTALL)
+    # Log parsed results
+    logger.info(f"Parsed {len(parsed)} header items:")
+    for i, obj in enumerate(parsed[:10]):  # Log first 10 items
+        logger.info(f"  Item {i}: {obj}")
+    # Normalize parsed entries and return
+        page = int(obj.get('page')) if obj.get('page') else None
+        level = obj.get('suggested_level')
+        conf = float(obj.get('confidence') or 0)
+        if t and page is not None:
+            out.append({'text': t, 'page': page-1, 'suggested_level': level, 'confidence': conf})
+    logger.info(f"Returning {len(out)} valid header entries")
+    return out
     logger.info("DataFrame head:")
     logger.info(df.head().to_string())
     # Save Excel to a file on disk
     output_path = "output.xlsx"
     try:
         if os.path.exists(output_path):
             file_size = os.path.getsize(output_path)
             logger.info(f"Output file exists, size: {file_size} bytes")
         gr.Textbox(label="LLM Prompt")
     ],
     outputs = gr.File(file_count="single", label="Download Excel")