Marthee committed on
Commit
a966ccd
·
verified ·
1 Parent(s): 7c121c1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +168 -18
app.py CHANGED
@@ -405,14 +405,178 @@ def identify_headers_and_save_excel(pdf_path, model, llm_prompt):
405
  return None
406
 
407
  logger.info(f"Got {len(result)} results, creating DataFrame")
408
- df = pd.DataFrame(result)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
 
410
- # Log DataFrame info
411
- logger.info(f"DataFrame shape: {df.shape}")
412
- logger.info(f"DataFrame columns: {df.columns.tolist()}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
413
  logger.info("DataFrame head:")
414
  logger.info(df.head().to_string())
415
 
 
 
 
 
 
416
  # Save Excel to a file on disk
417
  output_path = "output.xlsx"
418
  try:
@@ -423,20 +587,6 @@ def identify_headers_and_save_excel(pdf_path, model, llm_prompt):
423
  if os.path.exists(output_path):
424
  file_size = os.path.getsize(output_path)
425
  logger.info(f"Output file exists, size: {file_size} bytes")
426
- else:
427
- logger.error(f"Output file was not created at: {output_path}")
428
-
429
- except Exception as e:
430
- logger.error(f"Failed to save Excel file: {e}")
431
- return None
432
-
433
- return output_path # return file path, not BytesIO
434
-
435
- iface = gr.Interface(
436
- fn=identify_headers_and_save_excel,
437
- inputs=[
438
- gr.Textbox(label="Document Link"),
439
- gr.Textbox(label="Model Type"),
440
  gr.Textbox(label="LLM Prompt")
441
  ],
442
  outputs = gr.File(file_count="single", label="Download Excel")
 
405
  return None
406
 
407
  logger.info(f"Got {len(result)} results, creating DataFrame")
408
+ import json
409
+ import requests
410
+ from io import BytesIO
411
+ import gradio as gr
412
+ import pandas as pd
413
+ from io import BytesIO
414
+ import fitz # PyMuPDF
415
+
416
+ from urllib.parse import urlparse, unquote
417
+ import os
418
+ from io import BytesIO
419
+ import re
420
+ import requests
421
+ import pandas as pd
422
+ import fitz # PyMuPDF
423
+ import re
424
+ import urllib.parse
425
+ import difflib
426
+ from fuzzywuzzy import fuzz
427
+ import copy
428
+ # import tsadropboxretrieval
429
+
430
+ import urllib.parse
431
+ import logging
432
+
433
+ # Set up logging to see everything
434
+
435
+ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
436
+ """Ask an LLM (OpenRouter) to identify headers in the document.
437
+ Returns a list of dicts: {text, page, suggested_level, confidence}.
438
+ The function sends plain page-line strings to the LLM (including page numbers)
439
+ and asks for a JSON array containing only header lines with suggested levels.
440
+ """
441
+ logger.info("=" * 80)
442
+ logger.info("STARTING IDENTIFY_HEADERS_WITH_OPENROUTER")
443
+ y1 = spans[0]['bbox'][3]
444
+ # if y0 < top_margin or y1 > (page_height - bottom_margin):
445
+ # continue
446
+ for s in spans:
447
+ # text,font,size,flags,color
448
+ ArrayofTextWithFormat={s.get('text')}
449
+
450
+ # prefix with page for easier mapping back
451
+ lines_for_prompt.append(f"PAGE {pno+1}: {ArrayofTextWithFormat}")
452
+
453
+ # text = " ".join(s.get('text','') for s in spans).strip()
454
+ # if text:
455
+ # # prefix with page for easier mapping back
456
+ # lines_for_prompt.append(f"PAGE {pno+1}: {text}")
457
+ lines_on_page += 1
458
+
459
+ if lines_on_page > 0:
460
+ prompt = LLM_prompt + "\n\nLines:\n" + "\n".join(lines_for_prompt)
461
+
462
+ logger.debug(f"Full prompt length: {len(prompt)} characters")
463
+ # Changed: Print entire prompt, not truncated
464
+ print("=" * 80)
465
+ print("FULL LLM PROMPT:")
466
+ print(prompt)
467
+ logger.error(f"Could not save prompt to file: {e}")
468
 
469
+ if not api_key:
470
+ # No API key: return empty so caller can fallback to heuristics
471
+ logger.error("No API key provided")
472
+ return []
473
+
474
+ url = "https://openrouter.ai/api/v1/chat/completions"
475
+
476
+ # Build headers following the OpenRouter example
477
+ headers = {
478
+ "Authorization": f"Bearer {api_key}",
479
+ "Content-Type": "application/json",
480
+ "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
481
+ }
482
+
483
+ # Log request details (without exposing full API key)
484
+ logger.info(f"Making request to OpenRouter with model: {model}")
485
+ logger.debug(f"Headers (API key masked): { {k: '***' if k == 'Authorization' else v for k, v in headers.items()} }")
486
+
487
+ # Wrap the prompt as the example 'content' array expected by OpenRouter
488
+ body = {
489
+ "model": model,
490
+ "messages": [
491
+ ]
492
+ }
493
+
494
+ # Debug: log request body (truncated) and write raw response for inspection
495
+ try:
496
+ # Changed: Log full body (excluding prompt text which is already logged)
497
+ logger.debug(f"Request body (without prompt text): { {k: v if k != 'messages' else '[...prompt...]' for k, v in body.items()} }")
498
+
499
+ # Removed timeout parameter
500
+ resp = requests.post(
501
+ url=url,
502
+ headers=headers,
503
+ resp.raise_for_status()
504
+
505
+ resp_text = resp.text
506
+ # Changed: Print entire response
507
+ print("=" * 80)
508
+ print("FULL LLM RESPONSE:")
509
+ print(resp_text)
510
+
511
+ logger.info(f"LLM raw response length: {len(resp_text)}")
512
+
513
+ # Save raw response for offline inspection
514
+ try:
515
+ with open("llm_debug.json", "w", encoding="utf-8") as fh:
516
+ fh.write(resp_text)
517
+
518
+ if not text_reply:
519
+ logger.error("Could not extract text reply from response")
520
+ # Changed: Print the entire response structure for debugging
521
+ print("=" * 80)
522
+ print("FAILED TO EXTRACT TEXT REPLY. FULL RESPONSE STRUCTURE:")
523
+ print(json.dumps(rj, indent=2))
524
+ print("=" * 80)
525
+ return []
526
+
527
+ # Changed: Print the extracted text reply
528
+ print("=" * 80)
529
+ print("EXTRACTED TEXT REPLY:")
530
+ print(text_reply)
531
+ except json.JSONDecodeError as e:
532
+ logger.error(f"Failed to parse JSON: {e}")
533
+ logger.error(f"JSON string that failed to parse: {js[:1000]}")
534
+ # Try to find any JSON-like structure
535
+ try:
536
+ # Try to extract any JSON array
537
+ import re
538
+ json_pattern = r'\[\s*\{.*?\}\s*\]'
539
+ matches = re.findall(json_pattern, text_reply, re.DOTALL)
540
+
541
+ # Log parsed results
542
+ logger.info(f"Parsed {len(parsed)} header items:")
543
+ for i, obj in enumerate(parsed[:10]): # Log first 10 items
544
+ logger.info(f" Item {i}: {obj}")
545
+
546
+ # Normalize parsed entries and return
547
+ page = int(obj.get('page')) if obj.get('page') else None
548
+ level = obj.get('suggested_level')
549
+ conf = float(obj.get('confidence') or 0)
550
+
551
+
552
+ if t and page is not None:
553
+ out.append({'text': t, 'page': page-1, 'suggested_level': level, 'confidence': conf})
554
+
555
+
556
+
557
+
558
+
559
+
560
+
561
+
562
+
563
+
564
+
565
+
566
+
567
+
568
+ logger.info(f"Returning {len(out)} valid header entries")
569
+ return out
570
+
571
+
572
  logger.info("DataFrame head:")
573
  logger.info(df.head().to_string())
574
 
575
+
576
+
577
+
578
+
579
+
580
  # Save Excel to a file on disk
581
  output_path = "output.xlsx"
582
  try:
 
587
  if os.path.exists(output_path):
588
  file_size = os.path.getsize(output_path)
589
  logger.info(f"Output file exists, size: {file_size} bytes")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
590
  gr.Textbox(label="LLM Prompt")
591
  ],
592
  outputs = gr.File(file_count="single", label="Download Excel")