Marthee committed on
Commit
09310e8
·
verified ·
1 Parent(s): 44130c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +198 -423
app.py CHANGED
@@ -1,450 +1,225 @@
1
- from flask import Flask, request, jsonify, render_template, send_file, redirect, url_for, Response
2
- import tsadropboxretrieval
3
- # import findInitialMarkups
4
- import InitialMarkups
5
- import requests
6
- import fitz
7
- from io import BytesIO
8
- import datetime
9
- import time
10
- from threading import Thread
11
- from urllib.parse import quote, unquote, parse_qs
12
- # import pdftotext
13
  import json
14
- # -------------------- App & Globals --------------------
15
- app = Flask(__name__)
16
- pageNumTextFound = 0
17
- BASE_URL = "https://adr.trevorsadd.co.uk/api/testpage" ##changed this only
18
- backend_ready = False
19
- jsonoutput = [] # ensure defined before use
20
-
21
- # -------------------- Simple Health/Test --------------------
22
- @app.route("/health", methods=["GET"])
23
- def health():
24
- return jsonify(status="ok", time=datetime.datetime.now().isoformat())
25
-
26
- # -------------------- Root: keep it simple & reliable --------------------
27
- @app.route("/", methods=["GET"])
28
- def root():
29
- # Avoid missing-template errors. Keep it simple so external access works.
30
- return jsonify(message="FIND APIs root. Use /health or /testpage."), 200
31
-
32
- # -------------------- Headers Filtering Find 1 Space --------------------
33
- @app.route('/api/process-data', methods=['POST'])
34
- def process_headers():
35
- try:
36
- data = request.get_json(force=True) or {}
37
- filePath = data.get('filePath')
38
- if not filePath:
39
- return jsonify({"error": "Missing 'filePath'"}), 400
40
- headers = findInitialMarkups.headersfrompdf(filePath)
41
- return jsonify(headers)
42
- except Exception as e:
43
- print(f"Error in /api/process-data: {e}")
44
- return jsonify({"error": str(e)}), 500
45
 
46
- # -------------------- PDF to Text 1 Space --------------------
47
- @app.route('/processalltext1', methods=['POST'])
48
- def processalltextTotext():
49
- try:
50
- data = request.get_json(force=True) or {}
51
- pdfpath = data.get('filePath')
52
- if not pdfpath:
53
- return jsonify({"error": "Missing 'filePath' in request data"}), 400
54
- pdftext,filename = pdftotext.texts_from_pdfAllText(pdfpath)
55
- return jsonify({"message": "Data received", "input_data": pdftext,"Filename:":filename})
56
- except Exception as e:
57
- print(f"Error in /processalltext1: {e}")
58
- return jsonify({"error": str(e)}), 500
59
 
60
- # -------------------- Keepalive --------------------
61
- @app.route("/keepaliveapii", methods=["GET", "POST"])
62
- def keepaliveapi():
63
- try:
64
- print('Keepalive pinged')
65
- return 'alivee'
66
- except Exception as error:
67
- print('Error in keepalive:', error)
68
- return jsonify(status="error", message=str(error)), 500
69
-
70
- # -------------------- View PDF (Marked up) --------------------
71
- def getpdfcontent(pdf_path):
72
- # Handle Dropbox URLs
73
- if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
74
- pdf_path = pdf_path.replace('dl=0', 'dl=1')
75
-
76
- # Get the PDF bytes
77
- response = requests.get(pdf_path)
78
- pdf_bytes = response.content
79
 
80
- if not pdf_bytes or not pdf_bytes.startswith(b"%PDF"):
81
- raise ValueError("No valid PDF content found.")
82
 
83
- # Return a BytesIO stream
84
- return BytesIO(pdf_bytes)
85
 
 
 
86
 
87
- @app.route('/view-pdf', methods=['GET'])
88
- def view_pdf():
89
- encoded_pdf_link = request.args.get('pdfLink')
90
- if not encoded_pdf_link:
91
- return "Missing pdfLink parameter.", 400
92
 
93
- pdf_link = unquote(encoded_pdf_link)
94
- print("Extracted PDF Link:", pdf_link)
 
95
 
96
- try:
97
- pdf_content = getpdfcontent(pdf_link)
98
- except Exception as e:
99
- print("Error during PDF extraction:", e)
100
- return "PDF could not be processed.", 500
101
-
102
- if pdf_content is None:
103
- return "PDF content not found or broken.", 404
104
-
105
- # ✅ Do NOT wrap again in BytesIO
106
- return send_file(
107
- pdf_content,
108
- mimetype='application/pdf',
109
- as_attachment=False,
110
- download_name="annotated_page.pdf"
111
- )
112
 
113
- # -------------------- Process PDF -> Upload to Dropbox (renamed to avoid duplicate route) --------------------
114
- @app.route('/api/process-pdf', methods=['POST'])
115
- def process_pdf_and_upload():
116
- try:
117
- data = request.get_json(force=True) or {}
118
- pdfLink = data.get('filePath')
119
- if not pdfLink:
120
- return jsonify({"error": "'filePath' must be provided."}), 400
121
-
122
- print("Processing PDF:", pdfLink)
123
- pdfbytes, pdf_document, tablepdfoutput = InitialMarkups.extract_section_under_header(pdfLink)
124
-
125
- dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
126
- metadata = dbxTeam.sharing_get_shared_link_metadata(pdfLink)
127
-
128
- dbPath = '/TSA JOBS/ADR Test/FIND/'
129
- pdflink = tsadropboxretrieval.uploadanyFile(doc=pdf_document, path=dbPath, pdfname=metadata.name)
130
- tablepdfLink = tsadropboxretrieval.uploadanyFile(
131
- doc=tablepdfoutput,
132
- path=dbPath,
133
- pdfname=metadata.name.rsplit(".pdf", 1)[0] + ' Markup Summary.pdf'
134
- )
135
- print('Uploaded:', pdflink, tablepdfLink)
136
 
137
- return jsonify({
138
- "message": "PDF processed successfully.",
139
- "PDF_MarkedUp": pdflink,
140
- "Table_PDF_Markup_Summary": tablepdfLink
141
- })
142
- except Exception as e:
143
- print(f"Error in /api/process-pdf: {e}")
144
- return jsonify({"error": str(e)}), 500
145
 
146
- # -------------------- Not billed / Markup subsets --------------------
147
- @app.route('/findapitobebilled1', methods=['GET','POST'])
148
- def findapitobebilled1():
149
- try:
150
- data = request.get_json(force=True) or {}
151
- pdfLink = data.get('filePath')
152
- if not pdfLink:
153
- return jsonify({"error": "Missing 'filePath'"}), 400
154
- pdfbytes, pdf_document, tablepdfoutput, alltext_tobebilled, alltextNoNotbilled , filename = InitialMarkups.extract_section_under_header_tobebilledOnly(pdfLink)
155
-
156
- return jsonify(alltext_tobebilled)
157
- except Exception as e:
158
- print(f"Error in /findapitobebilled1: {e}")
159
- return jsonify({"error": str(e)}), 500
160
-
161
 
162
- # ----------------------------------------------------------------------
163
- @app.route('/findapitobebilled_htmlformat', methods=['GET','POST'])
164
- def findapitobebilled_htmlformat():
165
- try:
166
- data = request.get_json(force=True) or {}
167
- pdfLink = data.get('filePath')
168
- if not pdfLink:
169
- return jsonify({"error": "Missing 'filePath'"}), 400
170
- pdfbytes, pdf_document, tablepdfoutput, alltext_tobebilled, alltextNoNotbilled , filename = InitialMarkups.extract_section_under_header_tobebilledOnly(pdfLink)
171
- # Parse JSON string → list of dicts
172
- data = json.loads(tablepdfoutput)
173
-
174
- # Collect all body parts
175
- html_body = ""
176
-
177
- for section in data:
178
- if "head above 2" in section:
179
- html_body += f"<h1>{section['head above 2']}</h1><br>"
180
-
181
- if "head above 1" in section:
182
- html_body += f"<h2>{section['head above 1']}</h2><br>"
183
-
184
- if "Subject" in section:
185
- html_body += f"<h3>{section['Subject']}</h3><br>"
186
- if "BodyText" in section:
187
- html_body += f"<p>{' '.join(section['BodyText'])}</p><br>"
188
- # html_body += f"<div>{' '.join(section['bodytext'])}</div><br>"
189
-
190
- # Wrap everything into one HTML document
191
- html_content = f"""
192
- <!DOCTYPE html>
193
- <html>
194
- <head>
195
- <title>{filename}</title>
196
- <meta charset="utf-8">
197
- </head>
198
- <body>
199
- {html_body}
200
- </body>
201
- </html>
202
- """
203
- # return Response(html_content, mimetype="text/html", headers={"Filename": filename})
204
- return jsonify({"input_data": html_content,"Filename:":filename})
205
- except Exception as e:
206
- print(f"Error in /findapitobebilled_htmlformat: {e}")
207
- return jsonify({"error": str(e)}), 500
208
 
 
 
 
 
 
 
 
 
209
 
210
- @app.route('/view-pdf-tobebilled', methods=['GET'])
211
- def view_pdf_tobebilled():
212
- encoded_pdf_link = request.args.get('pdfLink')
213
- if not encoded_pdf_link:
214
- return "Missing pdfLink parameter.", 400
215
- pdf_link = unquote(encoded_pdf_link)
216
- print("Extracted PDF Link:", pdf_link)
217
- try:
218
- pdf_content = InitialMarkups.extract_section_under_header_tobebilledOnly(pdf_link)[0]
219
- except Exception as e:
220
- print("Error during PDF extraction:", e)
221
- return "PDF could not be processed.", 500
222
- if pdf_content is None or not pdf_content.startswith(b"%PDF"):
223
- return "PDF content not found or broken.", 404
224
- return send_file(
225
- BytesIO(pdf_content),
226
- mimetype='application/pdf',
227
- as_attachment=False,
228
- download_name=f"annotated_page_{pageNumTextFound}.pdf"
229
- )
230
 
231
- # -------------------- Final markups: view one highlight --------------------
232
- @app.route('/view-highlight', methods=['GET','POST'])
233
- def download_pdfHighlight():
234
- pdf_link = request.args.get('pdfLink')
235
- keyword = request.args.get('keyword')
236
- if not pdf_link or not keyword:
237
- return "Missing required parameters.", 400
238
-
239
- pdf_link = unquote(pdf_link)
240
- print("Extracted PDF Link:", pdf_link)
241
- print("Extracted Keyword:", keyword)
242
-
243
- global jsonoutput
244
- matching_item = next((item for item in jsonoutput if item.get("Subject") == keyword), None)
245
-
246
- if matching_item:
247
- page_number = int(matching_item.get("Page")) - 1
248
- stringtowrite = matching_item.get("head above 1")
249
- print(f"Page number for '{keyword}': {page_number}")
250
- else:
251
- page_number = 0
252
- stringtowrite = None
253
- print("No match found in jsonoutput; defaulting to page 0.")
254
-
255
- pdf_content = InitialMarkups.extract_section_under_headerRawan(pdf_link, keyword, page_number, stringtowrite)[0]
256
- if pdf_content is None:
257
- return "PDF content not found.", 404
258
-
259
- return send_file(
260
- BytesIO(pdf_content),
261
- mimetype='application/pdf',
262
- as_attachment=False,
263
- download_name=f"annotated_page_{pageNumTextFound}.pdf"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  )
265
 
266
- @app.route('/findapiFilteredHeadings', methods=['GET','POST'])
267
- def findapiFilteredHeadings():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  try:
269
- data = request.get_json(force=True) or {}
270
- pdfLink = data.get('filePath')
271
- listofheadings = data.get('listofheadings') # json array
272
- if not pdfLink or listofheadings is None:
273
- return jsonify({"error": "Missing 'filePath' or 'listofheadings'"}), 400
274
-
275
- pdfbytes, pdf_document, tablepdfoutput, alltext = InitialMarkups.extract_section_under_headerRawan(pdfLink, listofheadings)
276
- global jsonoutput
277
- jsonoutput = tablepdfoutput
278
- return jsonify(alltext)
279
- except Exception as e:
280
- print(f"Error in /findapiFilteredHeadings: {e}")
281
- return jsonify({"error": str(e)}), 500
282
 
283
- @app.route('/findapitobebilledonlyNew', methods=['GET','POST'])
284
- def findapitobebilledonly():
285
- try:
286
- data = request.get_json(force=True) or {}
287
- pdfLink = data.get('filePath')
288
- if not pdfLink:
289
- return jsonify({"error": "Missing 'filePath'"}), 400
290
- pdfbytes, pdf_document, tablepdfoutput, alltext , filename= InitialMarkups.extract_section_under_header_tobebilled2(pdfLink)
291
- # return jsonify(tablepdfoutput)
292
- # Parse JSON string list of dicts
293
- data = json.loads(tablepdfoutput)
294
-
295
- # Collect all body parts
296
- html_body = ""
297
-
298
- for section in data:
299
- if "head above 2" in section:
300
- html_body += f"<h1>{section['head above 2']}</h1><br>"
301
-
302
- if "head above 1" in section:
303
- html_body += f"<h2>{section['head above 1']}</h2><br>"
304
-
305
- if "Subject" in section:
306
- html_body += f"<h3>{section['Subject']}</h3><br>"
307
- if "BodyText" in section:
308
- html_body += f"<p>{' '.join(section['BodyText'])}</p><br>"
309
- # html_body += f"<div>{' '.join(section['bodytext'])}</div><br>"
310
-
311
- # Wrap everything into one HTML document
312
- html_content = f"""
313
- <!DOCTYPE html>
314
- <html>
315
- <head>
316
- <title>{filename}</title>
317
- <meta charset="utf-8">
318
- </head>
319
- <body>
320
- {html_body}
321
- </body>
322
- </html>
323
- """
324
- # return Response(html_content, mimetype="text/html", headers={"Filename": filename})
325
- return jsonify({"input_data": html_content,"Filename:":filename})
326
- # return Response(html_content, mimetype="text/html", headers={"Filename": filename})
327
  except Exception as e:
328
- print(f"Error in /findapitobebilledonly: {e}")
329
- return jsonify({"error": str(e)}), 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
 
 
 
 
 
 
 
 
 
 
331
 
 
332
 
333
- @app.route('/findapitobebilledonlyNewMultiplePDFS', methods=['GET','POST'])
334
- def findapitobebilledonlymarthe():
335
- try:
336
- data = request.get_json(force=True) or {}
337
- pdfLink = data.get('filePath')
338
- if not pdfLink:
339
- return jsonify({"error": "Missing 'filePath'"}), 400
340
- pdfbytes, pdf_document, tablepdfoutput, alltext , filename= InitialMarkups.extract_section_under_header_tobebilledMultiplePDFS(pdfLink)
341
- # return jsonify(tablepdfoutput)
342
- # Parse JSON string → list of dicts
343
- if isinstance(tablepdfoutput, str):
344
- data = json.loads(tablepdfoutput)
345
- else:
346
- data = tablepdfoutput
347
- # Collect all body parts
348
- html_body = ""
349
-
350
- for section in data:
351
- if "head above 2" in section:
352
- html_body += f"<h1>{section['head above 2']}</h1><br>"
353
-
354
- if "head above 1" in section:
355
- html_body += f"<h2>{section['head above 1']}</h2><br>"
356
-
357
- if "Subject" in section:
358
- html_body += f"<h3>{section['Subject']}</h3><br>"
359
- if "BodyText" in section:
360
- html_body += f"<p>{' '.join(section['BodyText'])}</p><br>"
361
- # html_body += f"<div>{' '.join(section['bodytext'])}</div><br>"
362
-
363
- # Wrap everything into one HTML document
364
- html_content = f"""
365
- <!DOCTYPE html>
366
- <html>
367
- <head>
368
- <title>{filename}</title>
369
- <meta charset="utf-8">
370
- </head>
371
- <body>
372
- {html_body}
373
- </body>
374
- </html>
375
- """
376
- # return Response(html_content, mimetype="text/html", headers={"Filename": filename})
377
- return jsonify({"input_data": html_content,"Filename:":filename})
378
- # return Response(html_content, mimetype="text/html", headers={"Filename": filename})
379
- except Exception as e:
380
- print(f"Error in /findapitobebilledonly: {e}")
381
- return jsonify({"error": str(e)}), 500
382
 
383
 
384
- @app.route('/findapiAllDocNoNotbilled', methods=['GET','POST'])
385
- def findapiAllDocNoNotbilled():
386
- try:
387
- data = request.get_json(force=True) or {}
388
- pdfLink = data.get('filePath')
389
- if not pdfLink:
390
- return jsonify({"error": "Missing 'filePath'"}), 400
391
- pdfbytes, pdf_document, tablepdfoutput, alltext_tobebilled, alltextNoNotbilled ,filename= InitialMarkups.extract_section_under_header_tobebilledOnly(pdfLink)
392
- return jsonify(alltextNoNotbilled)
393
- except Exception as e:
394
- print(f"Error in /findapiAllDocNoNotbilled: {e}")
395
- return jsonify({"error": str(e)}), 500
396
 
397
- # -------------------- Rawan - MC Connection --------------------
398
- @app.route('/findapi', methods=['GET','POST'])
399
- def findapi():
400
- try:
401
- data = request.get_json(force=True) or {}
402
- pdfLink = data.get('filePath')
403
- if not pdfLink:
404
- return jsonify({"error": "Missing 'filePath'"}), 400
405
-
406
- pdfbytes, pdf_document, tablepdfoutput = InitialMarkups.extract_section_under_header(pdfLink)
407
- global jsonoutput
408
- jsonoutput = tablepdfoutput
409
- return jsonify(tablepdfoutput)
410
- except Exception as e:
411
- print(f"Error in /findapi: {e}")
412
- return jsonify({"error": str(e)}), 500
413
-
414
- #--------------------testpage-----------------------------
415
- import socket
416
- from datetime import datetime
417
-
418
- @app.route('/testpage')
419
- def test_page():
420
- # Get some system info
421
- hostname = socket.gethostname()
422
- current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
423
-
424
- return f"""
425
- <!DOCTYPE html>
426
- <html>
427
- <head>
428
- <title>Server Test Page</title>
429
- <style>
430
- body {{ font-family: Arial, sans-serif; text-align: center; margin-top: 50px; }}
431
- .success {{ color: #2ecc71; font-size: 24px; }}
432
- .info {{ color: #34495e; margin-top: 10px; }}
433
- .container {{ max-width: 600px; margin: 0 auto; text-align: left; }}
434
- </style>
435
- </head>
436
- <body>
437
- <div class="success">🚀 Flask Server is Running!</div>
438
- <div class="container">
439
- <p class="info"><strong>Hostname:</strong> {hostname}</p>
440
- <p class="info"><strong>Server Time:</strong> {current_time}</p>
441
- <p class="info"><strong>Endpoint:</strong> /testpage</p>
442
- <p class="info"><strong>Status:</strong> <span style="color: #2ecc71;">Operational ✅</span></p>
443
- </div>
444
- </body>
445
- </html>
446
- """
447
-
448
- # -------------------- Run --------------------
449
- if __name__ == "__main__":
450
- app.run(host="0.0.0.0", port=5000, debug=True)
 
1
+ import gradio as gr
2
+ import os
 
 
 
 
 
 
 
 
 
 
3
  import json
4
+ import requests
5
+ from io import BytesIO
6
+ import fitz # PyMuPDF
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
 
 
10
 
11
def get_toc_page_numbers(doc, max_pages_to_check=15):
    """Return the leading page indices covering front matter and the TOC.

    Scans the first ``max_pages_to_check`` pages of ``doc``. A page is treated
    as a table-of-contents page when at least one of its text lines either
    contains a run of two or more dots (dot leaders such as ``Intro ..... 4``)
    or consists solely of "table of contents", "contents" or "index"
    (case-insensitive, surrounding whitespace ignored).

    Args:
        doc: Document-like object supporting ``len(doc)`` and
            ``doc.load_page(i)``; each page must provide ``get_text("dict")``
            returning a mapping with a ``"blocks"`` list.
        max_pages_to_check: Number of leading pages to inspect.

    Returns:
        ``[0, 1, ..., last]`` where ``last`` is the 0-based index of the last
        detected TOC page (so the cover pages before the TOC are included),
        or ``[]`` when no TOC page is detected.
    """
    # Local import so this block does not depend on a module-level `import re`.
    import re

    dot_pattern = re.compile(r"\.{2,}")
    # ^...$ ensures the line is *only* the title, so ordinary sentences such as
    # "The contents of the bag..." do not match.
    title_pattern = re.compile(r"^\s*(table of contents|contents|index)\s*$", re.IGNORECASE)

    toc_pages = []
    for page_num in range(min(len(doc), max_pages_to_check)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]

        # Blocks without a "lines" entry contribute nothing.
        line_texts = (
            " ".join(span["text"] for span in line["spans"]).strip()
            for block in blocks
            for line in block.get("lines", [])
        )

        # A single dot-leader line or a title line is enough to flag the page.
        if any(dot_pattern.search(text) or title_pattern.match(text) for text in line_texts):
            toc_pages.append(page_num)

    if not toc_pages:
        return []

    # Use the LAST detected TOC page so that e.g. [2, 3] yields [0, 1, 2, 3].
    last_toc_page = toc_pages[-1]
    return list(range(last_toc_page + 1))
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
def openPDF(pdf_path):
    """Download the PDF at ``pdf_path`` and open it as a PyMuPDF document.

    Args:
        pdf_path: URL of the PDF. Any ``dl=0`` in the URL is rewritten to
            ``dl=1`` (presumably to turn a Dropbox preview link into a direct
            download -- confirm against the links callers pass).

    Returns:
        The document returned by ``fitz.open`` for the downloaded bytes.

    Raises:
        requests.RequestException: On network failure, timeout or a non-2xx
            HTTP status.
        ValueError: If the response body is empty or does not look like a PDF.
    """
    pdf_path = pdf_path.replace('dl=0', 'dl=1')

    # A timeout keeps a stalled download from hanging the request forever.
    response = requests.get(pdf_path, timeout=60)
    response.raise_for_status()

    pdf_bytes = response.content
    # Validate the raw bytes: a BytesIO object is always truthy, so testing the
    # stream itself can never detect an empty or non-PDF body. The "%PDF"
    # marker is searched for near the start rather than at offset 0 to
    # tolerate a few leading junk bytes.
    if not pdf_bytes or b"%PDF" not in pdf_bytes[:1024]:
        raise ValueError("No valid PDF content found.")

    return fitz.open(stream=BytesIO(pdf_bytes), filetype="pdf")
69
+
70
def _collect_prompt_lines(doc, toc_pages, pages_to_check, top_margin, bottom_margin):
    """Return "PAGE n: text" strings for lines outside the top/bottom margins."""
    skipped_pages = set(toc_pages)
    collected = []
    for pno in range(len(doc)):
        if pages_to_check and pno not in pages_to_check:
            continue
        if pno in skipped_pages:
            continue
        page = doc.load_page(pno)
        page_height = page.rect.height
        for block in page.get_text("dict").get('blocks', []):
            # Only type-0 blocks are processed (text blocks in PyMuPDF's dict output).
            if block.get('type') != 0:
                continue
            for line in block.get('lines', []):
                spans = line.get('spans', [])
                if not spans:
                    continue
                # The first span's bbox decides whether the line sits in the
                # header/footer band that should be ignored.
                y0 = spans[0]['bbox'][1]
                y1 = spans[0]['bbox'][3]
                if y0 < top_margin or y1 > (page_height - bottom_margin):
                    continue
                text = " ".join(s.get('text', '') for s in spans).strip()
                if text:
                    # Prefix with the 1-based page number for mapping back.
                    collected.append(f"PAGE {pno+1}: {text}")
    return collected


def _extract_reply_text(rj):
    """Return the assistant's text from a chat-completions response, or None."""
    if not isinstance(rj, dict):
        return None
    choices = rj.get('choices') or []
    if not isinstance(choices, list):
        return None

    text_reply = None
    if choices and isinstance(choices[0], dict):
        c0 = choices[0]
        msg = c0.get('message') or c0.get('delta') or {}
        content = msg.get('content') if isinstance(msg, dict) else None
        if isinstance(content, list):
            # Multi-part content: take the first non-empty text part.
            for part in content:
                if isinstance(part, dict) and part.get('type') == 'text' and part.get('text'):
                    text_reply = part.get('text')
                    break
        elif isinstance(content, str):
            text_reply = content
        elif isinstance(content, dict):
            text_reply = content.get('text')

    if not text_reply:
        # Fallback for completion-style responses that put the text on the choice.
        for choice in choices:
            if isinstance(choice, dict) and isinstance(choice.get('text'), str):
                text_reply = choice.get('text')
                break
    return text_reply


def _parse_header_entries(text_reply):
    """Parse the JSON array embedded in the reply into normalized header dicts."""
    s = text_reply.strip()
    # The model may wrap the array in prose or code fences; keep only [...].
    start = s.find('[')
    end = s.rfind(']')
    js = s[start:end + 1] if start != -1 and end > start else s
    try:
        parsed = json.loads(js)
    except ValueError:
        return []
    if not isinstance(parsed, list):
        return []

    out = []
    for obj in parsed:
        if not isinstance(obj, dict):
            continue
        t = obj.get('text')
        try:
            page = int(obj.get('page')) if obj.get('page') else None
            conf = float(obj.get('confidence') or 0)
        except (TypeError, ValueError):
            # Skip a malformed entry instead of failing the whole reply.
            continue
        if t and page is not None:
            # Pages are 1-based in the prompt; convert back to 0-based.
            out.append({
                'text': t,
                'page': page - 1,
                'suggested_level': obj.get('suggested_level'),
                'confidence': conf,
            })
    return out


def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check=None, top_margin=70, bottom_margin=85):
    """Ask an LLM (via OpenRouter) to identify header lines in a PDF.

    The document is downloaded, its text lines (excluding detected TOC pages
    and lines inside the top/bottom margins) are prefixed with their 1-based
    page number, and the caller's instructions followed by those lines are
    sent as a single user message. The reply is expected to contain a JSON
    array of objects with ``text``, ``page``, ``suggested_level`` and
    ``confidence``.

    Args:
        pdf_path: URL of the PDF to analyse.
        model: OpenRouter model identifier.
        LLM_prompt: Instruction text placed before the page lines.
        pages_to_check: Optional collection of 0-based page indices to
            restrict the scan to; falsy means all pages.
        top_margin: Lines whose top edge is above this y value are skipped.
        bottom_margin: Lines whose bottom edge is within this distance of the
            page bottom are skipped.

    Returns:
        A list of dicts ``{text, page, suggested_level, confidence}`` with
        ``page`` 0-based. An empty list is returned when there is no text, no
        API key, the request fails, or the reply cannot be parsed.
    """
    doc = openPDF(pdf_path)
    try:
        toc_pages = get_toc_page_numbers(doc)
        lines_for_prompt = _collect_prompt_lines(
            doc, toc_pages, pages_to_check, top_margin, bottom_margin
        )
    finally:
        # Release the document as soon as the text has been extracted.
        doc.close()

    if not lines_for_prompt:
        return []

    # Instructions first, then one page line per row. (Using the prompt as the
    # join separator would repeat the instructions between every line.)
    prompt = f"{LLM_prompt}\n\n" + "\n".join(lines_for_prompt)

    # The key must come from the environment (e.g. a deployment secret);
    # credentials must never be committed to source control.
    api_key = os.getenv("OPENROUTER_API_KEY") or None
    if not api_key:
        # No API key: return empty so the caller can fall back to heuristics.
        return []

    url = "https://openrouter.ai/api/v1/chat/completions"

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": os.getenv("OPENROUTER_REFERER", ""),
        "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
    }

    body = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt}
                ]
            }
        ]
    }

    try:
        print("LLM request (truncated):", prompt[:1000])
        resp = requests.post(
            url=url,
            headers=headers,
            data=json.dumps(body),
            timeout=120,  # do not hang forever on a stalled upstream
        )
        resp.raise_for_status()
        resp_text = resp.text
        print("LLM raw response length:", len(resp_text))
        # Best-effort: save the raw response for offline inspection.
        try:
            with open("llm_debug.json", "w", encoding="utf-8") as fh:
                fh.write(resp_text)
        except OSError as e:
            print("Warning: could not write llm_debug.json:", e)
        rj = resp.json()
        print("LLM parsed response keys:", list(rj.keys()) if isinstance(rj, dict) else type(rj))
    except (requests.RequestException, ValueError) as e:
        print("LLM call failed:", repr(e))
        return []

    text_reply = _extract_reply_text(rj)
    if not text_reply:
        return []

    return _parse_header_entries(text_reply)
214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
 
217
# Gradio UI: the three text inputs are passed positionally to
# identify_headers_with_openrouter as pdf_path, model and LLM_prompt; the
# returned list is shown in a single text box.
iface = gr.Interface(
    fn=identify_headers_with_openrouter,
    inputs=[
        gr.Textbox(label="Document Link"),
        gr.Textbox(label="Model Type"),
        gr.Textbox(label="LLM Prompt"),
    ],  # the comma here is required: without it the call is a SyntaxError
    outputs=gr.Textbox(label="Output"),
)

iface.launch()