File size: 1,394 Bytes
0a8644a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a43dcf5
0a8644a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a43dcf5
0a8644a
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import gradio as gr
import os
import json
import requests
from io import BytesIO
from datetime import datetime
from difflib import SequenceMatcher
import pandas as pd
from io import BytesIO
import fitz  # PyMuPDF
from collections import defaultdict, Counter
from urllib.parse import urlparse, unquote   
import os
from io import BytesIO
import re
import requests
import pandas as pd
import InitialMarkupsLLM_huggingFace
import fitz  # PyMuPDF
import re
import urllib.parse
import difflib

import copy
# import tsadropboxretrieval

import urllib.parse
import logging


# Root logging configuration: everything from DEBUG upwards is emitted, both
# to the console and to a local log file, so nothing is hidden while debugging.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'

# Console output (StreamHandler writes to stderr when no stream is given).
_console_handler = logging.StreamHandler()
# File output; mode='w' truncates debug.log each time the script starts.
_file_handler = logging.FileHandler('debug.log', mode='w')

logging.basicConfig(
    handlers=[_console_handler, _file_handler],
    format=_LOG_FORMAT,
    level=logging.DEBUG,
)

# Logger for this module, named after the module itself.
logger = logging.getLogger(__name__)



# Gradio front end for the header extractor.
# The four text inputs below are handed, in this order, to
# InitialMarkupsLLM_huggingFace.identify_headers_and_save_excel; its result is
# exposed through a file component labelled as the Excel download.
_input_fields = [
    gr.Textbox(label=caption)
    for caption in (
        "PDF URL",
        "Model Type",
        "LLM Prompt",
        "LLM Prompt Hierarchy",
    )
]
_result_file = gr.File(label="Download Excel Results")

iface = gr.Interface(
    title="PDF Header Extractor",
    outputs=_result_file,
    inputs=_input_fields,
    fn=InitialMarkupsLLM_huggingFace.identify_headers_and_save_excel,
)

# Start the app with Gradio's debug mode switched on so errors show up in the
# console.
iface.launch(debug=True)