Marthee committed on
Commit
d681c26
·
verified ·
1 Parent(s): 212ec1f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -217
app.py CHANGED
@@ -125,13 +125,12 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
125
  api_key = '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'
126
  if api_key is None:
127
  api_key = os.getenv("OPENROUTER_API_KEY") or None
128
-
129
  model = str(model)
130
  toc_pages = get_toc_page_numbers(doc)
131
  lines_for_prompt = []
132
-
133
  logger.info(f"TOC pages to skip: {toc_pages}")
134
- logger.info(f"Total pages in document: {len(doc)}")
135
 
136
  # Collect text lines from pages (skip TOC pages)
137
  total_lines = 0
@@ -145,30 +144,46 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
145
  page = doc.load_page(pno)
146
  page_height = page.rect.height
147
  lines_on_page = 0
148
-
149
- for block in page.get_text("dict").get('blocks', []):
150
- if block.get('type') != 0:
 
 
151
  continue
152
- for line in block.get('lines', []):
153
- spans = line.get('spans', [])
154
- if not spans:
155
- continue
156
- y0 = spans[0]['bbox'][1]
157
- y1 = spans[0]['bbox'][3]
158
- # if y0 < top_margin or y1 > (page_height - bottom_margin):
159
- # continue
160
- for s in spans:
161
- # text,font,size,flags,color
162
- ArrayofTextWithFormat={s.get('text')}
163
-
164
- # prefix with page for easier mapping back
165
- lines_for_prompt.append(f"PAGE {pno+1}: {ArrayofTextWithFormat}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
- # text = " ".join(s.get('text','') for s in spans).strip()
168
- # if text:
169
- # # prefix with page for easier mapping back
170
- # lines_for_prompt.append(f"PAGE {pno+1}: {text}")
171
- lines_on_page += 1
172
 
173
  if lines_on_page > 0:
174
  logger.debug(f"Page {pno}: collected {lines_on_page} lines")
@@ -185,7 +200,8 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
185
  for i, line in enumerate(lines_for_prompt[:10]):
186
  logger.info(f" {i}: {line}")
187
 
188
- prompt = LLM_prompt + "\n\nLines:\n" + "\n".join(lines_for_prompt)
 
189
 
190
  logger.debug(f"Full prompt length: {len(prompt)} characters")
191
  # Changed: Print entire prompt, not truncated
@@ -393,207 +409,26 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
393
 
394
 
395
  def identify_headers_and_save_excel(pdf_path, model, llm_prompt):
396
- logger.info("=" * 80)
397
- logger.info("STARTING IDENTIFY_HEADERS_AND_SAVE_EXCEL")
398
- logger.info(f"Inputs - PDF: {pdf_path}, Model: {model}")
399
-
400
  # Call your existing function
401
  result = identify_headers_with_openrouter(pdf_path, model, llm_prompt)
402
 
403
  if not result:
404
- logger.warning("No results returned from identify_headers_with_openrouter")
405
- return None
406
-
407
- logger.info(f"Got {len(result)} results, creating DataFrame")
408
- import json
409
- import requests
410
- from io import BytesIO
411
- import gradio as gr
412
- import pandas as pd
413
- from io import BytesIO
414
- import fitz # PyMuPDF
415
-
416
- from urllib.parse import urlparse, unquote
417
- import os
418
- from io import BytesIO
419
- import re
420
- import requests
421
- import pandas as pd
422
- import fitz # PyMuPDF
423
- import re
424
- import urllib.parse
425
- import difflib
426
- from fuzzywuzzy import fuzz
427
- import copy
428
- # import tsadropboxretrieval
429
-
430
- import urllib.parse
431
- import logging
432
-
433
- # Set up logging to see everything
434
-
435
- def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
436
- """Ask an LLM (OpenRouter) to identify headers in the document.
437
- Returns a list of dicts: {text, page, suggested_level, confidence}.
438
- The function sends plain page-line strings to the LLM (including page numbers)
439
- and asks for a JSON array containing only header lines with suggested levels.
440
- """
441
- logger.info("=" * 80)
442
- logger.info("STARTING IDENTIFY_HEADERS_WITH_OPENROUTER")
443
- y1 = spans[0]['bbox'][3]
444
- # if y0 < top_margin or y1 > (page_height - bottom_margin):
445
- # continue
446
- for s in spans:
447
- # text,font,size,flags,color
448
- ArrayofTextWithFormat={s.get('text')}
449
-
450
- # prefix with page for easier mapping back
451
- lines_for_prompt.append(f"PAGE {pno+1}: {ArrayofTextWithFormat}")
452
-
453
- # text = " ".join(s.get('text','') for s in spans).strip()
454
- # if text:
455
- # # prefix with page for easier mapping back
456
- # lines_for_prompt.append(f"PAGE {pno+1}: {text}")
457
- lines_on_page += 1
458
-
459
- if lines_on_page > 0:
460
- prompt = LLM_prompt + "\n\nLines:\n" + "\n".join(lines_for_prompt)
461
-
462
- logger.debug(f"Full prompt length: {len(prompt)} characters")
463
- # Changed: Print entire prompt, not truncated
464
- print("=" * 80)
465
- print("FULL LLM PROMPT:")
466
- print(prompt)
467
- logger.error(f"Could not save prompt to file: {e}")
468
-
469
- if not api_key:
470
- # No API key: return empty so caller can fallback to heuristics
471
- logger.error("No API key provided")
472
- return []
473
-
474
- url = "https://openrouter.ai/api/v1/chat/completions"
475
-
476
- # Build headers following the OpenRouter example
477
- headers = {
478
- "Authorization": f"Bearer {api_key}",
479
- "Content-Type": "application/json",
480
- "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
481
- }
482
-
483
- # Log request details (without exposing full API key)
484
- logger.info(f"Making request to OpenRouter with model: {model}")
485
- logger.debug(f"Headers (API key masked): { {k: '***' if k == 'Authorization' else v for k, v in headers.items()} }")
486
-
487
- # Wrap the prompt as the example 'content' array expected by OpenRouter
488
- body = {
489
- "model": model,
490
- "messages": [
491
- ]
492
- }
493
-
494
- # Debug: log request body (truncated) and write raw response for inspection
495
- try:
496
- # Changed: Log full body (excluding prompt text which is already logged)
497
- logger.debug(f"Request body (without prompt text): { {k: v if k != 'messages' else '[...prompt...]' for k, v in body.items()} }")
498
-
499
- # Removed timeout parameter
500
- resp = requests.post(
501
- url=url,
502
- headers=headers,
503
- resp.raise_for_status()
504
-
505
- resp_text = resp.text
506
- # Changed: Print entire response
507
- print("=" * 80)
508
- print("FULL LLM RESPONSE:")
509
- print(resp_text)
510
-
511
- logger.info(f"LLM raw response length: {len(resp_text)}")
512
-
513
- # Save raw response for offline inspection
514
- try:
515
- with open("llm_debug.json", "w", encoding="utf-8") as fh:
516
- fh.write(resp_text)
517
-
518
- if not text_reply:
519
- logger.error("Could not extract text reply from response")
520
- # Changed: Print the entire response structure for debugging
521
- print("=" * 80)
522
- print("FAILED TO EXTRACT TEXT REPLY. FULL RESPONSE STRUCTURE:")
523
- print(json.dumps(rj, indent=2))
524
- print("=" * 80)
525
- return []
526
 
527
- # Changed: Print the extracted text reply
528
- print("=" * 80)
529
- print("EXTRACTED TEXT REPLY:")
530
- print(text_reply)
531
- except json.JSONDecodeError as e:
532
- logger.error(f"Failed to parse JSON: {e}")
533
- logger.error(f"JSON string that failed to parse: {js[:1000]}")
534
- # Try to find any JSON-like structure
535
- try:
536
- # Try to extract any JSON array
537
- import re
538
- json_pattern = r'\[\s*\{.*?\}\s*\]'
539
- matches = re.findall(json_pattern, text_reply, re.DOTALL)
540
 
541
- # Log parsed results
542
- logger.info(f"Parsed {len(parsed)} header items:")
543
- for i, obj in enumerate(parsed[:10]): # Log first 10 items
544
- logger.info(f" Item {i}: {obj}")
545
-
546
- # Normalize parsed entries and return
547
- page = int(obj.get('page')) if obj.get('page') else None
548
- level = obj.get('suggested_level')
549
- conf = float(obj.get('confidence') or 0)
550
-
551
-
552
- if t and page is not None:
553
- out.append({'text': t, 'page': page-1, 'suggested_level': level, 'confidence': conf})
554
-
555
-
556
-
557
-
558
-
559
-
560
-
561
-
562
-
563
-
564
-
565
-
566
-
567
-
568
- logger.info(f"Returning {len(out)} valid header entries")
569
- return out
570
-
571
 
572
- logger.info("DataFrame head:")
573
- logger.info(df.head().to_string())
574
 
 
575
 
576
-
577
-
578
-
579
-
580
- # Save Excel to a file on disk
581
- output_path = "output.xlsx"
582
- try:
583
- df.to_excel(output_path, index=False, engine='openpyxl')
584
- logger.info(f"Excel file saved successfully to: {output_path}")
585
-
586
- # Verify file was created
587
- if os.path.exists(output_path):
588
- file_size = os.path.getsize(output_path)
589
- logger.info(f"Output file exists, size: {file_size} bytes")
590
  gr.Textbox(label="LLM Prompt")
591
  ],
592
- outputs = gr.File(file_count="single", label="Download Excel")
593
-
594
  )
595
 
596
- if __name__ == "__main__":
597
- print("Starting Gradio interface...")
598
- logger.info("Launching Gradio interface")
599
- iface.launch()
 
125
  api_key = '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'
126
  if api_key is None:
127
  api_key = os.getenv("OPENROUTER_API_KEY") or None
 
128
  model = str(model)
129
  toc_pages = get_toc_page_numbers(doc)
130
  lines_for_prompt = []
131
+ pgestoRun=20
132
  logger.info(f"TOC pages to skip: {toc_pages}")
133
+ logger.info(f"Total pages in document: {pgestoRun}")
134
 
135
  # Collect text lines from pages (skip TOC pages)
136
  total_lines = 0
 
144
  page = doc.load_page(pno)
145
  page_height = page.rect.height
146
  lines_on_page = 0
147
+ text_dict = page.get_text("dict")
148
+ lines = []
149
+ y_tolerance = 0.2 # tweak if needed (1–3 usually works)
150
+ for block in text_dict["blocks"]:
151
+ if block["type"] != 0:
152
  continue
153
+ for line in block["lines"]:
154
+ for span in line["spans"]:
155
+ text = span["text"].strip()
156
+ if not text:
157
+ continue
158
+ x0, y0, x1, y1 = span["bbox"]
159
+ matched = False
160
+ for l in lines:
161
+ if abs(l["y"] - y0) <= y_tolerance:
162
+ l["spans"].append((x0, text))
163
+ matched = True
164
+ break
165
+ if not matched:
166
+ lines.append({
167
+ "y": y0,
168
+ "spans": [(x0, text)]
169
+ })
170
+ lines.sort(key=lambda l: l["y"])
171
+
172
+ # Join text inside each line
173
+ final_lines = []
174
+ for l in lines:
175
+ l["spans"].sort(key=lambda s: s[0]) # left → right
176
+ line_text = " ".join(text for _, text in l["spans"])
177
+ final_lines.append(line_text)
178
+
179
+ # Result
180
+ for line in final_lines:
181
+
182
+ if text:
183
+ # prefix with page for easier mapping back
184
+ lines_for_prompt.append(f"PAGE {pno+1}: {line}")
185
+ lines_on_page += 1
186
 
 
 
 
 
 
187
 
188
  if lines_on_page > 0:
189
  logger.debug(f"Page {pno}: collected {lines_on_page} lines")
 
200
  for i, line in enumerate(lines_for_prompt[:10]):
201
  logger.info(f" {i}: {line}")
202
 
203
+ prompt = "\n\nLines:\n" + "\n".join(lines_for_prompt)
204
+
205
 
206
  logger.debug(f"Full prompt length: {len(prompt)} characters")
207
  # Changed: Print entire prompt, not truncated
 
409
 
410
 
411
  def identify_headers_and_save_excel(pdf_path, model, llm_prompt):
 
 
 
 
412
  # Call your existing function
413
  result = identify_headers_with_openrouter(pdf_path, model, llm_prompt)
414
 
415
  if not result:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416
 
417
+ df = pd.DataFrame(result)
 
 
 
 
 
 
 
 
 
 
 
 
418
 
419
+ # Save Excel to a file on disk
420
+ output_path = "output.xlsx"
421
+ df.to_excel(output_path, index=False, engine='openpyxl')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
 
 
 
423
 
424
+ return output_path # return file path, not BytesIO
425
 
426
+ iface = gr.Interface(
427
+ fn=identify_headers_and_save_excel,
428
+ gr.Textbox(label="Model Type"),
 
 
 
 
 
 
 
 
 
 
 
429
  gr.Textbox(label="LLM Prompt")
430
  ],
431
+ outputs=gr.File(label="Download Excel") # File expects a path
 
432
  )
433
 
434
+ iface.launch()