"""Gradio app: identify section headers in a linked PDF via an OpenRouter LLM.

Downloads a PDF from a share link (Dropbox-style ``dl=0`` links are rewritten
to ``dl=1``), extracts body-text lines while skipping page margins and the
table-of-contents pages, asks an LLM to pick out the header lines, and shows
the result in a Gradio table.
"""

import json
import os
import re
from io import BytesIO

import gradio as gr
import requests
import fitz  # PyMuPDF

# Kept from the original file; presumably used by other tooling around this
# script (none are referenced below).
import copy
import difflib
import urllib.parse
from urllib.parse import urlparse, unquote

import pandas as pd
from fuzzywuzzy import fuzz

# import tsadropboxretrieval


def get_toc_page_numbers(doc, max_pages_to_check=15):
    """Return 0-based page numbers of the front matter, up to and including
    the last detected table-of-contents page.

    A page counts as a TOC page when it contains a line that is exactly
    "Table of Contents" / "Contents" / "Index" (case-insensitive), OR at
    least one line with dot leaders ("Chapter 1 ..... 12").

    Returns [] when no TOC page is found within *max_pages_to_check* pages.
    """
    toc_pages = []
    # Dot leaders are a strong TOC signal.
    dot_pattern = re.compile(r"\.{2,}")
    # ^/$ ensure the line is JUST the title word(s); IGNORECASE matches
    # "CONTENTS", "Contents", "index", etc.
    title_pattern = re.compile(
        r"^\s*(table of contents|contents|index)\s*$", re.IGNORECASE
    )

    for page_num in range(min(len(doc), max_pages_to_check)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]
        dot_line_count = 0
        has_toc_title = False
        for block in blocks:
            for line in block.get("lines", []):
                # Join the span texts to reconstruct the visual line.
                line_text = " ".join(
                    span["text"] for span in line["spans"]
                ).strip()
                if dot_pattern.search(line_text):
                    dot_line_count += 1
                if title_pattern.match(line_text):
                    has_toc_title = True
        # Deliberately sensitive: a single dot-leader line marks the page.
        if has_toc_title or dot_line_count >= 1:
            toc_pages.append(page_num)

    if toc_pages:
        # BUG FIX: use the LAST detected TOC page (the original read
        # toc_pages[0]), so TOC pages [2, 3] yield [0, 1, 2, 3] — covering
        # the cover page, inside cover, and the whole TOC.
        last_toc_page = toc_pages[-1]
        return list(range(0, last_toc_page + 1))
    return []  # nothing found


def openPDF(pdf_path):
    """Download the PDF at *pdf_path* and open it with PyMuPDF.

    Dropbox links ending in 'dl=0' are rewritten to 'dl=1' to force a
    direct download.

    Raises:
        requests.HTTPError: on a non-2xx response.
        ValueError: when the response body is empty.
    """
    pdf_path = pdf_path.replace('dl=0', 'dl=1')
    # Always set a timeout: requests.get never times out by default.
    response = requests.get(pdf_path, timeout=60)
    response.raise_for_status()
    # BUG FIX: the original tested `not BytesIO(...)`, which is always
    # False (file objects are truthy) — test the raw bytes instead.
    if not response.content:
        raise ValueError("No valid PDF content found.")
    return fitz.open(stream=BytesIO(response.content), filetype="pdf")


def identify_headers_with_openrouter(pdf_path, model, LLM_prompt,
                                     pages_to_check=None,
                                     top_margin=70, bottom_margin=85):
    """Ask an LLM (OpenRouter) to identify headers in the document.

    Sends plain page-line strings (prefixed with their 1-based page number)
    to the LLM and expects a JSON array of header entries back.

    Args:
        pdf_path: URL of the PDF (Dropbox share links are handled).
        model: OpenRouter model identifier.
        LLM_prompt: instruction text placed before the document lines.
        pages_to_check: optional collection of 0-based pages to restrict to.
        top_margin / bottom_margin: vertical bands (points) to skip, to
            exclude running headers and footers.

    Returns:
        A list of dicts {text, page (0-based), suggested_level, confidence};
        [] when no API key is configured, the request fails, or the reply
        cannot be parsed.
    """
    doc = openPDF(pdf_path)
    # SECURITY FIX: the original hard-coded a live API key in source.
    # Keys must only ever come from the environment.
    api_key = os.getenv("OPENROUTER_API_KEY") or None
    model = str(model)
    toc_pages = get_toc_page_numbers(doc)

    # Collect body-text lines from pages (skip TOC pages and margins).
    lines_for_prompt = []
    for pno in range(len(doc)):
        if pages_to_check and pno not in pages_to_check:
            continue
        if pno in toc_pages:
            continue
        page = doc.load_page(pno)
        page_height = page.rect.height
        for block in page.get_text("dict").get('blocks', []):
            if block.get('type') != 0:  # 0 = text block in PyMuPDF
                continue
            for line in block.get('lines', []):
                spans = line.get('spans', [])
                if not spans:
                    continue
                y0 = spans[0]['bbox'][1]
                y1 = spans[0]['bbox'][3]
                # Drop lines in the header/footer bands.
                if y0 < top_margin or y1 > (page_height - bottom_margin):
                    continue
                text = " ".join(s.get('text', '') for s in spans).strip()
                if text:
                    # Prefix with the page for easier mapping back.
                    lines_for_prompt.append(f"PAGE {pno+1}: {text}")

    if not lines_for_prompt:
        return []

    # BUG FIX: the original used LLM_prompt.join(lines_for_prompt), which
    # inserts the whole instruction text BETWEEN every pair of lines (and
    # omits it entirely for a single line). The prompt must come first,
    # followed by the document lines.
    prompt = LLM_prompt + "\n" + "\n".join(lines_for_prompt)

    if not api_key:
        # No API key: return empty so the caller can fall back to heuristics.
        return []

    url = "https://openrouter.ai/api/v1/chat/completions"
    # Build headers following the OpenRouter example.
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": os.getenv("OPENROUTER_REFERER", ""),
        "X-Title": os.getenv("OPENROUTER_X_TITLE", ""),
    }
    # Wrap the prompt as the 'content' array expected by OpenRouter.
    body = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt}
                ],
            }
        ],
    }

    # Debug: log the request (truncated) and persist the raw response.
    try:
        print("LLM request (truncated):", prompt[:1000])
        resp = requests.post(url=url, headers=headers, data=json.dumps(body))
        resp.raise_for_status()
        resp_text = resp.text
        print("LLM raw response length:", len(resp_text))
        # Save the raw response for offline inspection; best-effort only.
        try:
            with open("llm_debug.json", "w", encoding="utf-8") as fh:
                fh.write(resp_text)
        except Exception as e:
            print("Warning: could not write llm_debug.json:", e)
        rj = resp.json()
        print("LLM parsed response keys:",
              list(rj.keys()) if isinstance(rj, dict) else type(rj))
    except Exception as e:
        print("LLM call failed:", repr(e))
        return []

    # Extract the textual reply robustly across response shapes.
    text_reply = None
    if isinstance(rj, dict):
        choices = rj.get('choices') or []
        if choices:
            c0 = choices[0]
            msg = c0.get('message') or c0.get('delta') or {}
            content = msg.get('content')
            if isinstance(content, list):
                # Content array: find the first text part.
                for c in content:
                    if c.get('type') == 'text' and c.get('text'):
                        text_reply = c.get('text')
                        break
            elif isinstance(content, str):
                text_reply = content
            elif (isinstance(msg, dict) and msg.get('content')
                  and isinstance(msg.get('content'), dict)):
                text_reply = msg.get('content').get('text')
        if not text_reply:
            # Legacy completions shape: choices[i]['text'].
            for c in rj.get('choices', []):
                if isinstance(c.get('text'), str):
                    text_reply = c.get('text')
                    break
    if not text_reply:
        return []

    # Slice out the outermost JSON array, tolerating surrounding prose.
    s = text_reply.strip()
    start = s.find('[')
    end = s.rfind(']')
    js = s[start:end + 1] if start != -1 and end != -1 else s
    try:
        parsed = json.loads(js)
    except Exception:
        return []
    if not isinstance(parsed, list):
        # Robustness: the model may return a bare object instead of a list.
        return []

    # Normalize parsed entries (pages become 0-based) and return.
    out = []
    for obj in parsed:
        t = obj.get('text')
        page = int(obj.get('page')) if obj.get('page') else None
        level = obj.get('suggested_level')
        conf = float(obj.get('confidence') or 0)
        if t and page is not None:
            out.append({
                'text': t,
                'page': page - 1,
                'suggested_level': level,
                'confidence': conf,
            })
    return out


def identify_headers_with_table(pdf_path, model, LLM_prompt):
    """Wrapper converting the JSON result to a Gradio Dataframe payload.

    Returns a list of [text, 1-based page, level, confidence] rows;
    an empty list when no headers were identified.
    """
    result = identify_headers_with_openrouter(pdf_path, model, LLM_prompt)
    if not result:
        return []  # empty table when there are no results
    return [
        [item['text'], item['page'] + 1,
         item['suggested_level'], item['confidence']]
        for item in result
    ]


# Column names for the output table.
columns = ["Text", "Page", "Suggested Level", "Confidence"]

# Gradio interface wiring.
iface = gr.Interface(
    fn=identify_headers_with_table,
    inputs=[
        gr.Textbox(label="Document Link"),
        gr.Textbox(label="Model Type"),
        gr.Textbox(label="LLM Prompt"),
    ],
    outputs=gr.Dataframe(headers=columns),
)

iface.launch()