bllin001 committed on
Commit
2ec811e
·
verified ·
1 Parent(s): b2d5c09

Upload 2 files

Browse files
Files changed (2) hide show
  1. main.py +86 -0
  2. requirements.txt +94 -0
main.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import subprocess
from pathlib import Path

import streamlit as st
3
+
4
+ #=======================================================================================================================#
5
+
6
def clear_submit():
    """Reset per-document state when the file uploader's selection changes.

    Registered as the uploader's ``on_change`` callback. Clears the
    ``"submit"`` flag and discards any ``"mmd"`` text stored for an earlier
    document, so a newly selected file is never displayed (or offered for
    download) together with the previous file's extraction result.
    """
    st.session_state["submit"] = False
    # Without this, the stored text of the previous PDF would still be
    # rendered on the rerun triggered by selecting a different file.
    if "mmd" in st.session_state:
        del st.session_state["mmd"]
8
+
9
# Seed the "clicked" flag on the first run of a session; on reruns the key is
# already present and its stored value is left untouched.
if "clicked" not in st.session_state:
    st.session_state["clicked"] = False
11
+
12
def click_button():
    """Mark the session's ``clicked`` flag as set."""
    st.session_state["clicked"] = True
14
+
15
# Page-level settings: tab title and icon, wide layout, default sidebar state.
st.set_page_config(
    page_title='OCR App',
    page_icon=':pencil:',
    layout='wide',
    initial_sidebar_state='auto',
)
16
+
17
+ #=======================================================================================================================#
18
+
19
+ #--------------------------Sidebar--------------------------#
20
+
21
# All elements below are placed in the sidebar through the
# `st.sidebar.<element>` notation.
st.sidebar.title('Load document')

# PDF-only upload widget; changing the selection invokes clear_submit.
uploaded_file = st.sidebar.file_uploader(
    "Upload file",
    type=["pdf"],
    help="Only PDF files are supported",
    on_change=clear_submit,
)

# The extraction controls are rendered only once a file is present, so
# `extract_text` is bound only in that case.
if uploaded_file:
    st.sidebar.markdown('---')
    st.sidebar.title('Extract text from PDF')
    extract_text = st.sidebar.button(
        'Extract',
        help='Extract text from the document',
    )
36
+
37
+ #=======================================================================================================================#
38
+
39
+ #--------------------------Main Page--------------------------#
40
+
41
if uploaded_file:

    # Keep only the last path component of the client-supplied name so the
    # upload is always written inside the working folder; fall back to a
    # fixed name if nothing usable remains.
    safe_name = Path(uploaded_file.name).name
    if safe_name in ('', '..'):
        safe_name = 'document.pdf'

    # Working folder holding both the uploaded PDF and the OCR output.
    # Create it on demand instead of assuming it already exists.
    files_dir = Path('./files')
    files_dir.mkdir(parents=True, exist_ok=True)

    input_path = str(files_dir / safe_name)
    output_path = f'{files_dir}/'

    # The script expects the OCR result next to the PDF under the same name
    # with a `.mmd` extension. Swap only the final suffix (case-insensitive
    # by construction) rather than replacing every '.pdf' substring.
    output_file = Path(safe_name).with_suffix('.mmd').name
    mmd_path = files_dir / output_file

    # Persist the upload so the command-line tool can read it from disk.
    with open(input_path, 'wb') as f:
        f.write(uploaded_file.getbuffer())

    @st.cache_resource(show_spinner=False)
    def load_model(input_path, output_path):
        """Run the ``nougat`` command on a PDF and write its output to a folder.

        Despite the name, nothing is loaded or returned: the function only
        launches ``nougat <input_path> -o <output_path>`` and waits for it.

        Args:
            input_path: Path of the PDF file to process.
            output_path: Folder that receives the generated output.

        Raises:
            OSError: If the ``nougat`` executable cannot be started.
            subprocess.CalledProcessError: If ``nougat`` exits with a
                non-zero status.
        """
        # NOTE(review): the cache is keyed on the two path arguments, so a
        # different PDF uploaded under the same file name appears to reuse
        # the earlier run's output - confirm whether that is intended.
        subprocess.run(['nougat', input_path, '-o', output_path], check=True)

    if extract_text:
        with st.spinner('Extracting text...'):
            try:
                load_model(input_path, output_path)
                mmd = mmd_path.read_text(encoding='utf-8')
            except (OSError, subprocess.CalledProcessError) as err:
                # Report the failure and make sure no stale text from an
                # earlier extraction is presented as this run's result.
                if "mmd" in st.session_state:
                    del st.session_state["mmd"]
                st.error(f'Text extraction failed: {err}')
            else:
                # Keep the text in session state so it survives the rerun
                # triggered by pressing the download button.
                st.session_state["mmd"] = mmd

    # Show the result only when an extraction has been stored; an explicit
    # check replaces the former catch-all that hid every error.
    if "mmd" in st.session_state:
        st.write(st.session_state["mmd"])

        with st.sidebar:
            st.success('Text extracted successfully!')
            st.markdown('---')
            st.title('Download file')
            st.download_button(
                label='Download',
                data=st.session_state["mmd"],
                file_name=Path(output_file).with_suffix('.md').name,
                mime='text/markdown',
            )
84
+
85
+
86
+
requirements.txt ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.9.3
2
+ aiosignal==1.3.1
3
+ albumentations==1.3.1
4
+ altair==5.2.0
5
+ async-timeout==4.0.3
6
+ attrs==23.2.0
7
+ blinker==1.7.0
8
+ cachetools==5.3.2
9
+ certifi==2024.2.2
10
+ cffi==1.16.0
11
+ charset-normalizer==3.3.2
12
+ click==8.1.7
13
+ cryptography==42.0.2
14
+ datasets==2.16.1
15
+ filelock==3.13.1
16
+ frozenlist==1.4.1
17
+ fsspec==2023.10.0
18
+ gitdb==4.0.11
19
+ GitPython==3.1.41
20
+ huggingface-hub==0.20.3
21
+ idna==3.6
22
+ imageio==2.33.1
23
+ Jinja2==3.1.3
24
+ joblib==1.3.2
25
+ jsonschema==4.21.1
26
+ jsonschema-specifications==2023.12.1
27
+ lazy_loader==0.3
28
+ Levenshtein==0.24.0
29
+ lightning==2.1.4
30
+ lightning-utilities==0.10.1
31
+ markdown-it-py==3.0.0
32
+ MarkupSafe==2.1.5
33
+ mdurl==0.1.2
34
+ mpmath==1.3.0
35
+ multidict==6.0.5
36
+ multiprocess==0.70.15
37
+ munch==4.0.0
38
+ networkx==3.2.1
39
+ nltk==3.8.1
40
+ nougat-ocr==0.1.17
41
+ numpy==1.26.4
42
+ opencv-python-headless==4.9.0.80
43
+ orjson==3.9.13
44
+ pandas==2.2.0
45
+ pillow==10.2.0
46
+ protobuf==4.25.2
47
+ pyarrow==15.0.0
48
+ pyarrow-hotfix==0.6
49
+ pycparser==2.21
50
+ pycryptodome==3.20.0
51
+ pydeck==0.8.1b0
52
+ pypdf==4.0.1
53
+ pypdfium2==4.26.0
54
+ python-Levenshtein==0.24.0
55
+ pytorch-lightning==2.1.4
56
+ pytz==2024.1
57
+ PyYAML==6.0.1
58
+ qudida==0.0.4
59
+ rapidfuzz==3.6.1
60
+ referencing==0.33.0
61
+ regex==2023.12.25
62
+ requests==2.31.0
63
+ rich==13.7.0
64
+ rpds-py==0.17.1
65
+ ruamel.yaml==0.18.5
66
+ ruamel.yaml.clib==0.2.8
67
+ safetensors==0.4.2
68
+ scikit-image==0.22.0
69
+ scikit-learn==1.4.0
70
+ scipy==1.12.0
71
+ sconf==0.2.5
72
+ sentencepiece==0.1.99
73
+ smmap==5.0.1
74
+ streamlit==1.31.0
75
+ sympy==1.12
76
+ tenacity==8.2.3
77
+ threadpoolctl==3.2.0
78
+ tifffile==2024.1.30
79
+ timm==0.5.4
80
+ tokenizers==0.15.1
81
+ toml==0.10.2
82
+ toolz==0.12.1
83
+ torch==2.2.0
84
+ torchmetrics==1.3.0.post0
85
+ torchvision==0.17.0
86
+ tqdm==4.66.1
87
+ transformers==4.37.2
88
+ tzdata==2023.4
89
+ tzlocal==5.2
90
+ urllib3==2.2.0
91
+ validators==0.22.0
92
+ xxhash==3.4.1
93
+ yarl==1.9.4
94
+