File size: 8,549 Bytes
09310e8
 
44130c7
09310e8
 
 
44130c7
012e651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44130c7
 
 
09310e8
 
44130c7
09310e8
 
44130c7
09310e8
 
 
 
44130c7
09310e8
 
 
44130c7
09310e8
 
44130c7
09310e8
 
 
 
44130c7
09310e8
 
 
44130c7
09310e8
 
 
 
 
44130c7
09310e8
 
 
 
 
44130c7
09310e8
 
 
 
 
 
 
 
44130c7
 
09310e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ff438b
09310e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44130c7
 
09310e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44130c7
09310e8
 
 
 
 
44130c7
09310e8
 
 
 
 
 
 
 
 
 
 
 
44130c7
09310e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44130c7
09310e8
 
 
 
 
 
 
 
 
44130c7
09310e8
44130c7
6d90c86
 
 
 
 
 
 
 
 
 
 
 
 
 
44130c7
 
6d90c86
09310e8
6d90c86
 
 
09310e8
6d90c86
 
 
09310e8
44130c7
09310e8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
import gradio as gr
import os
import json
import requests
from io import BytesIO
import fitz  # PyMuPDF

from urllib.parse import urlparse, unquote
import os
from io import BytesIO
import re
import requests
import pandas as pd
import fitz  # PyMuPDF
import re
import urllib.parse
import difflib
from fuzzywuzzy import fuzz
import copy
# import tsadropboxretrieval

import urllib.parse





def get_toc_page_numbers(doc, max_pages_to_check=15):
    """Return the leading page indices to skip as front matter / table of contents.

    Only the first ``max_pages_to_check`` pages are inspected. A page is
    treated as a table-of-contents page when at least one of its text lines
    either contains a run of two or more dots (dot leaders such as
    ``Intro ..... 3``) or consists solely of "table of contents", "contents"
    or "index" (case-insensitive, surrounding whitespace ignored).

    Args:
        doc: Document supporting ``len(doc)`` and ``doc.load_page(i)``, whose
            pages provide ``get_text("dict")`` (as PyMuPDF documents do).
        max_pages_to_check: Number of leading pages to scan.

    Returns:
        ``[0, 1, ..., last]`` where ``last`` is the index of the last detected
        TOC page, so the cover, inside cover and every TOC page are included
        (e.g. TOC pages 2 and 3 give ``[0, 1, 2, 3]``). An empty list when no
        TOC page is found.
    """
    dot_pattern = re.compile(r"\.{2,}")
    # ^...$ keeps sentences such as "The contents of the bag" from matching.
    title_pattern = re.compile(
        r"^\s*(table of contents|contents|index)\s*$", re.IGNORECASE
    )

    def is_toc_line(line):
        line_text = " ".join(span["text"] for span in line["spans"]).strip()
        return bool(dot_pattern.search(line_text) or title_pattern.match(line_text))

    last_toc_page = None
    for page_num in range(min(len(doc), max_pages_to_check)):
        blocks = doc.load_page(page_num).get_text("dict")["blocks"]
        # One qualifying line is enough; image blocks carry no "lines" key.
        if any(
            is_toc_line(line)
            for block in blocks
            for line in block.get("lines", [])
        ):
            # Pages are visited in ascending order, so this ends up holding
            # the LAST TOC page (the original took the first one by mistake).
            last_toc_page = page_num

    if last_toc_page is None:
        return []
    return list(range(last_toc_page + 1))


def openPDF(pdf_path):
    """Download a PDF over HTTP and open it in memory with PyMuPDF.

    Args:
        pdf_path: URL of the PDF. Any ``dl=0`` substring is rewritten to
            ``dl=1`` before the request (presumably to turn a share link into
            a direct download -- confirm against the links callers pass).

    Returns:
        The document returned by ``fitz.open`` for the downloaded bytes.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an error status.
        ValueError: The response body was empty.
    """
    pdf_path = pdf_path.replace('dl=0', 'dl=1')
    # A timeout keeps a stalled server from hanging the caller forever.
    response = requests.get(pdf_path, timeout=60)
    response.raise_for_status()

    # Check the raw bytes: a BytesIO object is always truthy, so testing the
    # wrapper (as before) could never detect an empty download.
    if not response.content:
        raise ValueError("No valid PDF content found.")

    return fitz.open(stream=BytesIO(response.content), filetype="pdf")

def _collect_prompt_lines(doc, skip_pages, pages_to_check, top_margin, bottom_margin):
    """Return ``"PAGE n: text"`` strings for the body-text lines of ``doc``."""
    collected = []
    for pno in range(len(doc)):
        if pages_to_check and pno not in pages_to_check:
            continue
        if pno in skip_pages:
            continue
        page = doc.load_page(pno)
        page_height = page.rect.height
        for block in page.get_text("dict").get('blocks', []):
            # PyMuPDF marks text blocks with type 0; skip everything else.
            if block.get('type') != 0:
                continue
            for line in block.get('lines', []):
                spans = line.get('spans', [])
                if not spans:
                    continue
                # Drop lines inside the top/bottom margins (running
                # headers and footers), judged by the first span's bbox.
                y0 = spans[0]['bbox'][1]
                y1 = spans[0]['bbox'][3]
                if y0 < top_margin or y1 > (page_height - bottom_margin):
                    continue
                text = " ".join(s.get('text', '') for s in spans).strip()
                if text:
                    # Prefix with the 1-based page for easier mapping back.
                    collected.append(f"PAGE {pno+1}: {text}")
    return collected


def _extract_reply_text(rj):
    """Return the assistant text from a decoded chat response, or ``None``."""
    if not isinstance(rj, dict):
        return None
    choices = rj.get('choices')
    if not isinstance(choices, list):
        return None

    if choices and isinstance(choices[0], dict):
        msg = choices[0].get('message') or choices[0].get('delta') or {}
        content = msg.get('content') if isinstance(msg, dict) else None
        if isinstance(content, str):
            if content:
                return content
        elif isinstance(content, list):
            # Multi-part content: take the first non-empty text part.
            for part in content:
                if (isinstance(part, dict) and part.get('type') == 'text'
                        and isinstance(part.get('text'), str) and part['text']):
                    return part['text']
        elif isinstance(content, dict):
            if isinstance(content.get('text'), str) and content['text']:
                return content['text']

    # Fallback: completion-style choices that carry a bare "text" field.
    for choice in choices:
        if isinstance(choice, dict) and isinstance(choice.get('text'), str):
            return choice['text']
    return None


def _parse_header_entries(text_reply):
    """Parse the JSON array embedded in ``text_reply`` into header dicts."""
    s = text_reply.strip()
    # The model may wrap the array in prose or code fences; keep only the
    # outermost [...] span when there is one.
    start = s.find('[')
    end = s.rfind(']')
    js = s[start:end+1] if start != -1 and end > start else s
    try:
        parsed = json.loads(js)
    except (TypeError, ValueError):
        return []
    if not isinstance(parsed, list):
        return []

    out = []
    for obj in parsed:
        if not isinstance(obj, dict):
            continue
        t = obj.get('text')
        try:
            page = int(obj['page']) if obj.get('page') else None
        except (TypeError, ValueError):
            continue
        try:
            conf = float(obj.get('confidence') or 0)
        except (TypeError, ValueError):
            conf = 0.0
        if t and page is not None:
            # The model sees 1-based "PAGE n" labels; callers get 0-based pages.
            out.append({'text': t, 'page': page-1,
                        'suggested_level': obj.get('suggested_level'),
                        'confidence': conf})
    return out


def identify_headers_with_openrouter(pdf_path, model,LLM_prompt, pages_to_check=None, top_margin=70, bottom_margin=85):
    """Ask an LLM (OpenRouter) to identify headers in the document.

    The PDF at ``pdf_path`` is downloaded, its body-text lines are collected
    (skipping detected table-of-contents pages and the top/bottom margins),
    and ``LLM_prompt`` followed by those lines is sent as a single user
    message. The reply is expected to contain a JSON array of header objects.

    The API key is read from the ``OPENROUTER_API_KEY`` environment variable.
    SECURITY: never embed the key in source; a key that has been committed
    must be treated as leaked and rotated.

    Args:
        pdf_path: URL of the PDF to analyse.
        model: OpenRouter model identifier (converted with ``str``).
        LLM_prompt: Instruction text placed before the page lines.
        pages_to_check: Optional collection of 0-based page indices to
            restrict the scan to; falsy means all pages.
        top_margin: Lines whose top edge lies above this y value are dropped.
        bottom_margin: Lines whose bottom edge lies within this distance of
            the page bottom are dropped.

    Returns:
        A list of dicts ``{text, page, suggested_level, confidence}`` with a
        0-based ``page``. An empty list when there is no text, no API key,
        the request fails, or the reply cannot be parsed.
    """
    doc = openPDF(pdf_path)
    try:
        toc_pages = set(get_toc_page_numbers(doc))
        lines_for_prompt = _collect_prompt_lines(
            doc, toc_pages, pages_to_check, top_margin, bottom_margin
        )
    finally:
        doc.close()

    if not lines_for_prompt:
        return []

    api_key = os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        # No API key: return empty so caller can fallback to heuristics
        return []

    # Instructions first, then the page lines one per row. (Joining the lines
    # WITH the prompt as separator repeated it between every line and lost it
    # entirely for a single line.)
    instructions = str(LLM_prompt or "").strip()
    page_text = "\n".join(lines_for_prompt)
    prompt = f"{instructions}\n\n{page_text}" if instructions else page_text

    url = "https://openrouter.ai/api/v1/chat/completions"

    # Build headers following the OpenRouter example
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": os.getenv("OPENROUTER_REFERER", ""),
        "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
    }

    # Wrap the prompt as the example 'content' array expected by OpenRouter
    body = {
        "model": str(model),
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt}
                ]
            }
        ]
    }

    # Debug: log request body (truncated) and write raw response for inspection
    print("LLM request (truncated):", prompt[:1000])
    try:
        resp = requests.post(
            url=url,
            headers=headers,
            data=json.dumps(body),
            timeout=120,
        )
        resp.raise_for_status()
        resp_text = resp.text
        rj = resp.json()
    except (requests.RequestException, ValueError) as e:
        print("LLM call failed:", repr(e))
        return []

    print("LLM raw response length:", len(resp_text))
    # Save raw response for offline inspection (best effort).
    try:
        with open("llm_debug.json", "w", encoding="utf-8") as fh:
            fh.write(resp_text)
    except OSError as e:
        print("Warning: could not write llm_debug.json:", e)
    print("LLM parsed response keys:", list(rj.keys()) if isinstance(rj, dict) else type(rj))

    text_reply = _extract_reply_text(rj)
    if not text_reply:
        return []

    return _parse_header_entries(text_reply)

# Wrapper function to convert JSON to a dataframe-friendly format
def identify_headers_with_table(pdf_path, model, LLM_prompt):
    """Run header detection and shape the result as rows for a table.

    Delegates to ``identify_headers_with_openrouter`` and converts each
    returned dict into ``[text, page, suggested_level, confidence]``, with the
    page shifted back to 1-based numbering for display. Returns an empty list
    when nothing was detected.
    """
    detected = identify_headers_with_openrouter(pdf_path, model, LLM_prompt)

    rows = []
    for entry in detected or []:
        display_page = entry['page'] + 1
        rows.append([
            entry['text'],
            display_page,
            entry['suggested_level'],
            entry['confidence'],
        ])
    return rows

# Header row shown above the results table, in the same order as the rows
# produced by identify_headers_with_table.
columns = ["Text", "Page", "Suggested Level", "Confidence"]


# Gradio UI: three free-text inputs mapped positionally onto
# identify_headers_with_table(pdf_path, model, LLM_prompt); the returned rows
# are rendered in a dataframe.
iface = gr.Interface(
    fn=identify_headers_with_table,
    inputs=[
        gr.Textbox(label=field_label)
        for field_label in ("Document Link", "Model Type", "LLM Prompt")
    ],
    outputs=gr.Dataframe(headers=columns),
)

# Start the web app as soon as the module is executed.
iface.launch()