Spaces:

findConsole
/

PromptTesting

Running

App Files Files Community

Marthee committed 17 days ago

Commit

d681c26

verified ·

1 Parent(s): 212ec1f

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -217

app.py CHANGED Viewed

@@ -125,13 +125,12 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
     api_key = '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'
     if api_key is None:
         api_key = os.getenv("OPENROUTER_API_KEY") or None
     model = str(model)
     toc_pages = get_toc_page_numbers(doc)
     lines_for_prompt = []
     logger.info(f"TOC pages to skip: {toc_pages}")
-    logger.info(f"Total pages in document: {len(doc)}")
     # Collect text lines from pages (skip TOC pages)
     total_lines = 0
@@ -145,30 +144,46 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
         page = doc.load_page(pno)
         page_height = page.rect.height
         lines_on_page = 0
-        for block in page.get_text("dict").get('blocks', []):
-            if block.get('type') != 0:
                 continue
-            for line in block.get('lines', []):
-                spans = line.get('spans', [])
-                if not spans:
-                    continue
-                y0 = spans[0]['bbox'][1]
-                y1 = spans[0]['bbox'][3]
-                # if y0 < top_margin or y1 > (page_height - bottom_margin):
-                #     continue
-                for s in spans:
-                    # text,font,size,flags,color
-                    ArrayofTextWithFormat={s.get('text')}
-                    # prefix with page for easier mapping back
-                    lines_for_prompt.append(f"PAGE {pno+1}: {ArrayofTextWithFormat}")
-                # text = " ".join(s.get('text','') for s in spans).strip()
-                # if text:
-                #     # prefix with page for easier mapping back
-                #     lines_for_prompt.append(f"PAGE {pno+1}: {text}")
-                    lines_on_page += 1
         if lines_on_page > 0:
             logger.debug(f"Page {pno}: collected {lines_on_page} lines")
@@ -185,7 +200,8 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
     for i, line in enumerate(lines_for_prompt[:10]):
         logger.info(f"  {i}: {line}")
-    prompt = LLM_prompt + "\n\nLines:\n" + "\n".join(lines_for_prompt)
     logger.debug(f"Full prompt length: {len(prompt)} characters")
     # Changed: Print entire prompt, not truncated
@@ -393,207 +409,26 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
 def identify_headers_and_save_excel(pdf_path, model, llm_prompt):
-    logger.info("=" * 80)
-    logger.info("STARTING IDENTIFY_HEADERS_AND_SAVE_EXCEL")
-    logger.info(f"Inputs - PDF: {pdf_path}, Model: {model}")
     # Call your existing function
     result = identify_headers_with_openrouter(pdf_path, model, llm_prompt)
     if not result:
-        logger.warning("No results returned from identify_headers_with_openrouter")
-        return None
-    logger.info(f"Got {len(result)} results, creating DataFrame")
-import json
-import requests
-from io import BytesIO
-import gradio as gr
-import pandas as pd
-from io import BytesIO
-import fitz  # PyMuPDF
-from urllib.parse import urlparse, unquote
-import os
-from io import BytesIO
-import re
-import requests
-import pandas as pd
-import fitz  # PyMuPDF
-import re
-import urllib.parse
-import difflib
-from fuzzywuzzy import fuzz
-import copy
-# import tsadropboxretrieval
-import urllib.parse
-import logging
-# Set up logging to see everything
-def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
-    """Ask an LLM (OpenRouter) to identify headers in the document.
-    Returns a list of dicts: {text, page, suggested_level, confidence}.
-    The function sends plain page-line strings to the LLM (including page numbers)
-    and asks for a JSON array containing only header lines with suggested levels.
-    """
-    logger.info("=" * 80)
-    logger.info("STARTING IDENTIFY_HEADERS_WITH_OPENROUTER")
-                y1 = spans[0]['bbox'][3]
-                # if y0 < top_margin or y1 > (page_height - bottom_margin):
-                #     continue
-                for s in spans:
-                    # text,font,size,flags,color
-                    ArrayofTextWithFormat={s.get('text')}
-                    # prefix with page for easier mapping back
-                    lines_for_prompt.append(f"PAGE {pno+1}: {ArrayofTextWithFormat}")
-                # text = " ".join(s.get('text','') for s in spans).strip()
-                # if text:
-                #     # prefix with page for easier mapping back
-                #     lines_for_prompt.append(f"PAGE {pno+1}: {text}")
-                    lines_on_page += 1
-        if lines_on_page > 0:
-    prompt = LLM_prompt + "\n\nLines:\n" + "\n".join(lines_for_prompt)
-    logger.debug(f"Full prompt length: {len(prompt)} characters")
-    # Changed: Print entire prompt, not truncated
-    print("=" * 80)
-    print("FULL LLM PROMPT:")
-    print(prompt)
-        logger.error(f"Could not save prompt to file: {e}")
-    if not api_key:
-        # No API key: return empty so caller can fallback to heuristics
-        logger.error("No API key provided")
-        return []
-    url = "https://openrouter.ai/api/v1/chat/completions"
-    # Build headers following the OpenRouter example
-    headers = {
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json",
-        "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
-    }
-    # Log request details (without exposing full API key)
-    logger.info(f"Making request to OpenRouter with model: {model}")
-    logger.debug(f"Headers (API key masked): { {k: '***' if k == 'Authorization' else v for k, v in headers.items()} }")
-    # Wrap the prompt as the example 'content' array expected by OpenRouter
-    body = {
-        "model": model,
-        "messages": [
-        ]
-    }
-    # Debug: log request body (truncated) and write raw response for inspection
-    try:
-        # Changed: Log full body (excluding prompt text which is already logged)
-        logger.debug(f"Request body (without prompt text): { {k: v if k != 'messages' else '[...prompt...]' for k, v in body.items()} }")
-        # Removed timeout parameter
-        resp = requests.post(
-            url=url,
-            headers=headers,
-        resp.raise_for_status()
-        resp_text = resp.text
-        # Changed: Print entire response
-        print("=" * 80)
-        print("FULL LLM RESPONSE:")
-        print(resp_text)
-        logger.info(f"LLM raw response length: {len(resp_text)}")
-        # Save raw response for offline inspection
-        try:
-            with open("llm_debug.json", "w", encoding="utf-8") as fh:
-                fh.write(resp_text)
-    if not text_reply:
-        logger.error("Could not extract text reply from response")
-        # Changed: Print the entire response structure for debugging
-        print("=" * 80)
-        print("FAILED TO EXTRACT TEXT REPLY. FULL RESPONSE STRUCTURE:")
-        print(json.dumps(rj, indent=2))
-        print("=" * 80)
-        return []
-    # Changed: Print the extracted text reply
-    print("=" * 80)
-    print("EXTRACTED TEXT REPLY:")
-    print(text_reply)
-    except json.JSONDecodeError as e:
-        logger.error(f"Failed to parse JSON: {e}")
-        logger.error(f"JSON string that failed to parse: {js[:1000]}")
-        # Try to find any JSON-like structure
-        try:
-            # Try to extract any JSON array
-            import re
-            json_pattern = r'\[\s*\{.*?\}\s*\]'
-            matches = re.findall(json_pattern, text_reply, re.DOTALL)
-    # Log parsed results
-    logger.info(f"Parsed {len(parsed)} header items:")
-    for i, obj in enumerate(parsed[:10]):  # Log first 10 items
-        logger.info(f"  Item {i}: {obj}")
-    # Normalize parsed entries and return
-        page = int(obj.get('page')) if obj.get('page') else None
-        level = obj.get('suggested_level')
-        conf = float(obj.get('confidence') or 0)
-        if t and page is not None:
-            out.append({'text': t, 'page': page-1, 'suggested_level': level, 'confidence': conf})
-    logger.info(f"Returning {len(out)} valid header entries")
-    return out
-    logger.info("DataFrame head:")
-    logger.info(df.head().to_string())
-    # Save Excel to a file on disk
-    output_path = "output.xlsx"
-    try:
-        df.to_excel(output_path, index=False, engine='openpyxl')
-        logger.info(f"Excel file saved successfully to: {output_path}")
-        # Verify file was created
-        if os.path.exists(output_path):
-            file_size = os.path.getsize(output_path)
-            logger.info(f"Output file exists, size: {file_size} bytes")
         gr.Textbox(label="LLM Prompt")
     ],
-    outputs = gr.File(file_count="single", label="Download Excel")
 )
-if __name__ == "__main__":
-    print("Starting Gradio interface...")
-    logger.info("Launching Gradio interface")
-    iface.launch()

     api_key = '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'
     if api_key is None:
         api_key = os.getenv("OPENROUTER_API_KEY") or None
     model = str(model)
     toc_pages = get_toc_page_numbers(doc)
     lines_for_prompt = []
+    pgestoRun=20
     logger.info(f"TOC pages to skip: {toc_pages}")
+    logger.info(f"Total pages in document: {pgestoRun}")
     # Collect text lines from pages (skip TOC pages)
     total_lines = 0
         page = doc.load_page(pno)
         page_height = page.rect.height
         lines_on_page = 0
+        text_dict = page.get_text("dict")
+        lines = []
+        y_tolerance = 0.2  # tweak if needed (1–3 usually works)
+        for block in text_dict["blocks"]:
+            if block["type"] != 0:
                 continue
+            for line in block["lines"]:
+                for span in line["spans"]:
+                    text = span["text"].strip()
+                    if not text:
+                        continue
+                    x0, y0, x1, y1 = span["bbox"]
+                    matched = False
+                    for l in lines:
+                        if abs(l["y"] - y0) <= y_tolerance:
+                            l["spans"].append((x0, text))
+                            matched = True
+                            break
+                    if not matched:
+                        lines.append({
+                            "y": y0,
+                            "spans": [(x0, text)]
+                        })
+        lines.sort(key=lambda l: l["y"])
+        # Join text inside each line
+        final_lines = []
+        for l in lines:
+            l["spans"].sort(key=lambda s: s[0])  # left → right
+            line_text = " ".join(text for _, text in l["spans"])
+            final_lines.append(line_text)
+        # Result
+        for line in final_lines:
+            if text:
+                # prefix with page for easier mapping back
+                lines_for_prompt.append(f"PAGE {pno+1}: {line}")
+                lines_on_page += 1
         if lines_on_page > 0:
             logger.debug(f"Page {pno}: collected {lines_on_page} lines")
     for i, line in enumerate(lines_for_prompt[:10]):
         logger.info(f"  {i}: {line}")
+    prompt = "\n\nLines:\n" + "\n".join(lines_for_prompt)
     logger.debug(f"Full prompt length: {len(prompt)} characters")
     # Changed: Print entire prompt, not truncated
 def identify_headers_and_save_excel(pdf_path, model, llm_prompt):
     # Call your existing function
     result = identify_headers_with_openrouter(pdf_path, model, llm_prompt)
     if not result:
+    df = pd.DataFrame(result)
+    # Save Excel to a file on disk
+    output_path = "output.xlsx"
+    df.to_excel(output_path, index=False, engine='openpyxl')
+    return output_path  # return file path, not BytesIO
+iface = gr.Interface(
+    fn=identify_headers_and_save_excel,
+        gr.Textbox(label="Model Type"),
         gr.Textbox(label="LLM Prompt")
     ],
+    outputs=gr.File(label="Download Excel")  # File expects a path
 )
+iface.launch()