jrpark committed on
Commit
d1aa69e
·
verified ·
1 Parent(s): 4a5a856

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ static/page11_img1.png filter=lfs diff=lfs merge=lfs -text
37
+ static/page16_img1.png filter=lfs diff=lfs merge=lfs -text
38
+ static/page2_img1.png filter=lfs diff=lfs merge=lfs -text
39
+ static/page3_img1.png filter=lfs diff=lfs merge=lfs -text
40
+ static/page4_img1.png filter=lfs diff=lfs merge=lfs -text
41
+ static/page7_img1.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # PyPI configuration file
171
+ .pypirc
172
+
173
+ data/
174
+ .DS_Store
175
+ .temp
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md CHANGED
@@ -1,12 +1,68 @@
1
  ---
2
- title: Pdf2html
3
- emoji: ⚡
4
- colorFrom: indigo
5
- colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 5.20.1
8
- app_file: app.py
9
- pinned: false
10
  ---
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: pdf2html
3
+ app_file: web_interface.py
 
 
4
  sdk: gradio
5
  sdk_version: 5.20.1
 
 
6
  ---
7
+ # pdf2html
8
 
9
+ PDF ํŒŒ์ผ์„ ๋‹จ์ผ ์ปฌ๋Ÿผ HTML๋กœ ๋ณ€ํ™˜ํ•˜๋Š” Python ํŒจํ‚ค์ง€์ž…๋‹ˆ๋‹ค.
10
+
11
+ ## 설치
12
+
13
+ ```bash
14
+ # Poetry๋ฅผ ์‚ฌ์šฉํ•˜๋Š” ๊ฒฝ์šฐ
15
+ poetry install
16
+
17
+ # ๋˜๋Š” pip ์‚ฌ์šฉ
18
+ pip install pdf2html
19
+ ```
20
+
21
+ ## 필요 라이브러리
22
+
23
+ ```bash
24
+ pip install PyMuPDF beautifulsoup4 langchain gradio gradio-pdf
25
+ ```
26
+
27
+ ## 사용법
28
+
29
+ ### ๋ช…๋ น์ค„ ์ธํ„ฐํŽ˜์ด์Šค
30
+
31
+ ```bash
32
+ # ์ง์ ‘ ์‹คํ–‰
33
+ poetry run python -m pdf2html 경로/파일명.pdf
34
+ poetry run python -m pdf2html 경로/파일명.pdf --output 출력디렉토리
35
+
36
+ # ์„ค์น˜ ํ›„ ์‹คํ–‰
37
+ pdf2html ๊ฒฝ๋กœ/ํŒŒ์ผ๋ช….pdf
38
+ pdf2html ๊ฒฝ๋กœ/ํŒŒ์ผ๋ช….pdf --output ์ถœ๋ ฅ๋””๋ ‰ํ† ๋ฆฌ
39
+ ```
40
+
41
+ ### ์›น ์ธํ„ฐํŽ˜์ด์Šค
42
+
43
+ ```bash
44
+ # ์ง์ ‘ ์‹คํ–‰
45
+ poetry run python -m web_interface
46
+
47
+ # ์„ค์น˜ ํ›„ ์‹คํ–‰
48
+ pdf2html-web
49
+ ```
50
+
51
+ ### Python ์ฝ”๋“œ์—์„œ ์‚ฌ์šฉ
52
+
53
+ ```python
54
+ from pdf2html import PDFToHTMLConverter
55
+
56
+ converter = PDFToHTMLConverter("๊ฒฝ๋กœ/ํŒŒ์ผ๋ช….pdf")
57
+ output_path = converter.convert()
58
+ print(f"๋ณ€ํ™˜ ์™„๋ฃŒ: {output_path}")
59
+ ```
60
+
61
+ ## ์ฃผ์š” ๊ธฐ๋Šฅ
62
+
63
+ - PDF ๋ฌธ์„œ์˜ ํ…์ŠคํŠธ, ์ด๋ฏธ์ง€, ํ‘œ ์ถ”์ถœ
64
+ - 1๋‹จ ์„ธ๋กœ ๋ ˆ์ด์•„์›ƒ์œผ๋กœ ์žฌ๊ตฌ์„ฑ
65
+ - ๋ฌธ๋‹จ ๊ตฌ์กฐ ๋ฐ ์„œ์‹ ์œ ์ง€
66
+ - ์ด๋ฏธ์ง€ ์ž๋™ ์ถ”์ถœ ๋ฐ ํฌํ•จ
67
+ - ํ‘œ ๊ตฌ์กฐ ๊ฐ์ง€ ๋ฐ HTML ํ…Œ์ด๋ธ”๋กœ ๋ณ€ํ™˜
68
+ - Gradio ๊ธฐ๋ฐ˜ ์›น ์ธํ„ฐํŽ˜์ด์Šค ์ œ๊ณต
__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from convert import PDFToHTMLConverter
2
+
3
+ __version__ = "0.1.0"
cli.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from convert import PDFToHTMLConverter
3
+
4
+ def main():
5
+ parser = argparse.ArgumentParser(description="PDF๋ฅผ HTML๋กœ ๋ณ€ํ™˜")
6
+ parser.add_argument("pdf_path", help="๋ณ€ํ™˜ํ•  PDF ํŒŒ์ผ ๊ฒฝ๋กœ")
7
+ parser.add_argument("--output", "-o", help="์ถœ๋ ฅ ๋””๋ ‰ํ† ๋ฆฌ (๊ธฐ๋ณธ๊ฐ’: PDF์™€ ๊ฐ™์€ ๋””๋ ‰ํ† ๋ฆฌ)")
8
+
9
+ args = parser.parse_args()
10
+
11
+ converter = PDFToHTMLConverter(args.pdf_path, args.output)
12
+ output_path = converter.convert()
13
+
14
+ print(f"๋ณ€ํ™˜ ์™„๋ฃŒ: {output_path}")
15
+
16
+ if __name__ == "__main__":
17
+ main()
convert.py ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ import fitz # PyMuPDF
4
+ import base64
5
+ import re
6
+ import shutil
7
+
8
+ class PDFToHTMLConverter:
9
+ def __init__(self, pdf_path, output_dir=None):
10
+ """
11
+ PDF๋ฅผ HTML๋กœ ๋ณ€ํ™˜ํ•˜๋Š” ํด๋ž˜์Šค ์ดˆ๊ธฐํ™”
12
+
13
+ Args:
14
+ pdf_path (str): PDF ํŒŒ์ผ ๊ฒฝ๋กœ
15
+ output_dir (str, optional): ์ถœ๋ ฅ ๋””๋ ‰ํ† ๋ฆฌ. ๊ธฐ๋ณธ๊ฐ’์€ PDF์™€ ๊ฐ™์€ ๋””๋ ‰ํ† ๋ฆฌ.
16
+ """
17
+ self.pdf_path = pdf_path
18
+ self.pdf_name = Path(pdf_path).stem
19
+
20
+ # ํ”„๋กœ์ ํŠธ ๋ฃจํŠธ ๋””๋ ‰ํ† ๋ฆฌ์— .temp ํด๋” ์ƒ์„ฑ
21
+ current_dir = Path.cwd()
22
+ self.temp_dir = current_dir / ".temp"
23
+
24
+ # ์ž…๋ ฅ PDF ์ €์žฅ ๋””๋ ‰ํ† ๋ฆฌ
25
+ self.pdf_dir = self.temp_dir / "temp_input_pdf"
26
+
27
+ # ์ถœ๋ ฅ HTML ์ €์žฅ ๋””๋ ‰ํ† ๋ฆฌ
28
+ self.output_dir = self.temp_dir / "temp_output_html"
29
+
30
+ # ์ด๋ฏธ์ง€ ์ €์žฅ ๋””๋ ‰ํ† ๋ฆฌ
31
+ self.img_dir = self.output_dir / "images"
32
+
33
+ # ํ•„์š”ํ•œ ๋””๋ ‰ํ† ๋ฆฌ ์ƒ์„ฑ
34
+ self.temp_dir.mkdir(exist_ok=True)
35
+ self.pdf_dir.mkdir(exist_ok=True)
36
+ self.output_dir.mkdir(exist_ok=True)
37
+ self.img_dir.mkdir(exist_ok=True)
38
+
39
+ # ๊ณ ์ •๋œ ํŒŒ์ผ ์ด๋ฆ„ ์„ค์ •
40
+ self.fixed_pdf_path = self.pdf_dir / "current.pdf"
41
+
42
+ # ํŒŒ์ผ ๊ฒฝ๋กœ๊ฐ€ ๋ฌธ์ž์—ด์ธ ๊ฒฝ์šฐ Path ๊ฐ์ฒด๋กœ ๋ณ€ํ™˜
43
+ if isinstance(pdf_path, str):
44
+ pdf_path = Path(pdf_path)
45
+
46
+ # ์›๋ณธ PDF ํŒŒ์ผ์ด ๊ณ ์ • ๊ฒฝ๋กœ์™€ ๋‹ค๋ฅธ ๊ฒฝ์šฐ์—๋งŒ ๋ณต์‚ฌ
47
+ if pdf_path != self.fixed_pdf_path:
48
+ shutil.copy2(str(pdf_path), str(self.fixed_pdf_path))
49
+ print(f"PDF ํŒŒ์ผ ๋ณต์‚ฌ๋จ: {pdf_path} -> {self.fixed_pdf_path}")
50
+ else:
51
+ print(f"PDF ํŒŒ์ผ์ด ์ด๋ฏธ ์˜ฌ๋ฐ”๋ฅธ ์œ„์น˜์— ์žˆ์Šต๋‹ˆ๋‹ค: {self.fixed_pdf_path}")
52
+
53
+ # PyMuPDF ๋ฌธ์„œ ๊ฐ์ฒด ์—ด๊ธฐ
54
+ self.doc = fitz.open(self.fixed_pdf_path)
55
+
56
+ # ๊ฒฐ๊ณผ HTML
57
+ self.html_content = ""
58
+ self.text_html_content = ""
59
+ self.media_html_content = ""
60
+
61
+ def _extract_text_with_structure(self, page):
62
+ """
63
+ ํŽ˜์ด์ง€์—์„œ ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•˜๊ณ  ๊ธฐ๋ณธ ๊ตฌ์กฐ๋ฅผ ์œ ์ง€
64
+
65
+ Args:
66
+ page (fitz.Page): PDF ํŽ˜์ด์ง€ ๊ฐ์ฒด
67
+
68
+ Returns:
69
+ str: ๊ตฌ์กฐํ™”๋œ HTML ํ…์ŠคํŠธ
70
+ """
71
+ blocks = page.get_text("dict")["blocks"]
72
+ html_text = []
73
+
74
+ for block in blocks:
75
+ if block["type"] == 0: # ํ…์ŠคํŠธ ๋ธ”๋ก
76
+ text_lines = []
77
+ for line in block["lines"]:
78
+ line_text = ""
79
+ for span in line["spans"]:
80
+ # ํฐํŠธ ํฌ๊ธฐ์™€ ์Šคํƒ€์ผ ๋ถ„์„
81
+ font_size = span["size"]
82
+ is_bold = "bold" in span["font"].lower() or span.get("flags", 0) & 16 != 0
83
+ is_italic = "italic" in span["font"].lower() or span.get("flags", 0) & 1 != 0
84
+
85
+ text = span["text"]
86
+
87
+ # ํฐํŠธ ํฌ๊ธฐ์— ๋”ฐ๋ผ ์ œ๋ชฉ ๋˜๋Š” ์ผ๋ฐ˜ ํ…์ŠคํŠธ๋กœ ๋ถ„๋ฅ˜
88
+ if font_size > 14: # ํฐ ํฐํŠธ๋Š” ์ œ๋ชฉ์ผ ๊ฐ€๋Šฅ์„ฑ์ด ๋†’์Œ
89
+ if is_bold:
90
+ text = f"<h1>{text}</h1>"
91
+ else:
92
+ text = f"<h2>{text}</h2>"
93
+ elif font_size > 12:
94
+ if is_bold:
95
+ text = f"<h3>{text}</h3>"
96
+ else:
97
+ text = f"<h4>{text}</h4>"
98
+ else:
99
+ if is_bold:
100
+ text = f"<strong>{text}</strong>"
101
+ if is_italic:
102
+ text = f"<em>{text}</em>"
103
+
104
+ line_text += text
105
+
106
+ text_lines.append(line_text)
107
+
108
+ # ํ…์ŠคํŠธ ๋ผ์ธ์„ ๋‹จ๋ฝ์œผ๋กœ ๊ฒฐํ•ฉ
109
+ if text_lines:
110
+ paragraph = " ".join(text_lines)
111
+ html_text.append(f"<p>{paragraph}</p>")
112
+
113
+ return "\n".join(html_text)
114
+
115
+ def _extract_images(self, page, page_num):
116
+ """
117
+ ํŽ˜์ด์ง€์—์„œ ์ด๋ฏธ์ง€ ์ถ”์ถœ
118
+
119
+ Args:
120
+ page (fitz.Page): PDF ํŽ˜์ด์ง€ ๊ฐ์ฒด
121
+ page_num (int): ํŽ˜์ด์ง€ ๋ฒˆํ˜ธ
122
+
123
+ Returns:
124
+ list: ์ด๋ฏธ์ง€ HTML ํƒœ๊ทธ ๋ชฉ๋ก
125
+ """
126
+ image_tags = []
127
+ image_list = page.get_images(full=True)
128
+
129
+ for img_idx, img_info in enumerate(image_list):
130
+ try:
131
+ xref = img_info[0]
132
+ base_img = self.doc.extract_image(xref)
133
+ image_bytes = base_img["image"]
134
+
135
+ # ์ด๋ฏธ์ง€ ํฌ๋งท ํ™•์ธ (๊ธฐ๋ณธ๊ฐ’์€ png)
136
+ image_ext = base_img["ext"]
137
+ if image_ext.lower() not in ["jpeg", "jpg", "png"]:
138
+ image_ext = "png"
139
+
140
+ # ๊ณ ์ •๋œ ๊ฒฝ๋กœ์— ์ด๋ฏธ์ง€ ์ €์žฅ
141
+ image_filename = f"page{page_num+1}_img{img_idx+1}.{image_ext}"
142
+ image_path = self.img_dir / image_filename
143
+
144
+ # ์ด๋ฏธ์ง€ ๋””๋ ‰ํ† ๋ฆฌ ํ™•์ธ
145
+ if not self.img_dir.exists():
146
+ self.img_dir.mkdir(parents=True, exist_ok=True)
147
+
148
+ with open(image_path, "wb") as img_file:
149
+ img_file.write(image_bytes)
150
+
151
+ # ๋””๋ฒ„๊น…์„ ์œ„ํ•œ ์ฝ”๋“œ
152
+ print(f"์ด๋ฏธ์ง€ ์ €์žฅ: {image_path} (ํฌ๊ธฐ: {len(image_bytes)} ๋ฐ”์ดํŠธ)")
153
+
154
+ # ์ด๋ฏธ์ง€ ํƒœ๊ทธ ์ƒ์„ฑ (์ƒ๋Œ€ ๊ฒฝ๋กœ ์‚ฌ์šฉ)
155
+ # ๊ฒฝ๋กœ๊ฐ€ HTML ํŒŒ์ผ์—์„œ ์˜ฌ๋ฐ”๋ฅด๊ฒŒ ์ฐธ์กฐ๋  ์ˆ˜ ์žˆ๋„๋ก ํ•ฉ๋‹ˆ๋‹ค
156
+ rel_img_path = f"images/{image_filename}"
157
+ img_tag = f'<div class="image-container"><img src="{rel_img_path}" alt="Page {page_num+1} Image {img_idx+1}" style="max-width:100%; height:auto;"/></div>'
158
+ image_tags.append(img_tag)
159
+
160
+ except Exception as e:
161
+ print(f"์ด๋ฏธ์ง€ ์ถ”์ถœ ์ค‘ ์˜ค๋ฅ˜: {str(e)}")
162
+
163
+ return image_tags
164
+
165
+ def _extract_tables(self, page):
166
+ """
167
+ ํŽ˜์ด์ง€์—์„œ ํ‘œ ์ถ”์ถœ ์‹œ๋„
168
+
169
+ Args:
170
+ page (fitz.Page): PDF ํŽ˜์ด์ง€ ๊ฐ์ฒด
171
+
172
+ Returns:
173
+ list: ํ‘œ HTML ํƒœ๊ทธ ๋ชฉ๋ก
174
+ """
175
+ # ํ‘œ ๊ฐ์ง€ ๋ฐ ์ถ”์ถœ์€ ๋ณต์žกํ•œ ์ž‘์—…์ž…๋‹ˆ๋‹ค.
176
+ # ์ด ๊ฐ„๋‹จํ•œ ์˜ˆ์‹œ์—์„œ๋Š” ํ…Œ์ด๋ธ”๋กœ ๋ณด์ด๋Š” ๊ตฌ์กฐ๋ฅผ ๊ฐ์ง€ํ•˜๋Š” ๊ธฐ๋ณธ์ ์ธ ์ ‘๊ทผ ๋ฐฉ์‹์„ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.
177
+ tables = []
178
+
179
+ # ํŽ˜์ด์ง€์˜ ํ…์ŠคํŠธ ๋ธ”๋ก์„ ๋ถ„์„
180
+ blocks = page.get_text("dict")["blocks"]
181
+
182
+ # ๋†’์ด๊ฐ€ ๋น„์Šทํ•œ ํ…์ŠคํŠธ ๋ธ”๋ก์ด ๊ฐ€๋กœ๋กœ ์ •๋ ฌ๋œ ๊ฒฝ์šฐ ํ…Œ์ด๋ธ” ํ–‰์ผ ๊ฐ€๋Šฅ์„ฑ์ด ์žˆ์Œ
183
+ table_candidates = []
184
+
185
+ for i, block in enumerate(blocks):
186
+ if block["type"] == 0: # ํ…์ŠคํŠธ ๋ธ”๋ก
187
+ # ํ…์ŠคํŠธ ๋ธ”๋ก์˜ ์œ„์น˜ ์ •๋ณด
188
+ x0, y0, x1, y1 = block["bbox"]
189
+
190
+ # ๊ฐ™์€ ํ–‰์— ์žˆ๋Š” ๋‹ค๋ฅธ ํ…์ŠคํŠธ ๋ธ”๋ก ์ฐพ๊ธฐ
191
+ same_row_blocks = []
192
+
193
+ for j, other_block in enumerate(blocks):
194
+ if i != j and other_block["type"] == 0:
195
+ ox0, oy0, ox1, oy1 = other_block["bbox"]
196
+
197
+ # y ์ขŒํ‘œ๊ฐ€ ๋น„์Šทํ•˜๋ฉด ๊ฐ™์€ ํ–‰์ผ ๊ฐ€๋Šฅ์„ฑ์ด ์žˆ์Œ
198
+ if abs(y0 - oy0) < 5 and abs(y1 - oy1) < 5:
199
+ same_row_blocks.append(j)
200
+
201
+ # ๊ฐ™์€ ํ–‰์— ์—ฌ๋Ÿฌ ํ…์ŠคํŠธ ๋ธ”๋ก์ด ์žˆ์œผ๋ฉด ํ…Œ์ด๋ธ” ํ–‰์ผ ๊ฐ€๋Šฅ์„ฑ์ด ๋†’์Œ
202
+ if len(same_row_blocks) >= 2:
203
+ table_candidates.append((i, same_row_blocks))
204
+
205
+ # ํ…Œ์ด๋ธ” ํ›„๋ณด๊ฐ€ ์žˆ์œผ๋ฉด HTML ํ…Œ์ด๋ธ”๋กœ ๋ณ€ํ™˜
206
+ if table_candidates:
207
+ table_html = "<table border='1'>\n"
208
+
209
+ for row_idx, row_blocks in table_candidates:
210
+ table_html += "<tr>\n"
211
+
212
+ # ํ˜„์žฌ ๋ธ”๋ก ์ถ”๊ฐ€
213
+ block_text = ""
214
+ for line in blocks[row_idx]["lines"]:
215
+ for span in line["spans"]:
216
+ block_text += span["text"] + " "
217
+
218
+ table_html += f"<td>{block_text.strip()}</td>\n"
219
+
220
+ # ๊ฐ™์€ ํ–‰์˜ ๋‹ค๋ฅธ ๋ธ”๋ก ์ถ”๊ฐ€
221
+ for block_idx in row_blocks:
222
+ block_text = ""
223
+ for line in blocks[block_idx]["lines"]:
224
+ for span in line["spans"]:
225
+ block_text += span["text"] + " "
226
+
227
+ table_html += f"<td>{block_text.strip()}</td>\n"
228
+
229
+ table_html += "</tr>\n"
230
+
231
+ table_html += "</table>"
232
+ tables.append(table_html)
233
+
234
+ return tables
235
+
236
+ def _create_html_template(self, title, content, css_additional=""):
237
+ """
238
+ HTML ํ…œํ”Œ๋ฆฟ ์ƒ์„ฑ - ๋‹คํฌ ํ…Œ๋งˆ ์ ์šฉ
239
+
240
+ Args:
241
+ title (str): HTML ์ œ๋ชฉ
242
+ content (str): HTML ๋ณธ๋ฌธ ๋‚ด์šฉ
243
+ css_additional (str): ์ถ”๊ฐ€ CSS ์Šคํƒ€์ผ
244
+
245
+ Returns:
246
+ str: ์™„์„ฑ๋œ HTML ๋ฌธ์ž์—ด
247
+ """
248
+ return f"""<!DOCTYPE html>
249
+ <html lang="ko">
250
+ <head>
251
+ <meta charset="UTF-8">
252
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
253
+ <title>{title}</title>
254
+ <style>
255
+ body {{
256
+ font-family: Arial, sans-serif;
257
+ line-height: 1.6;
258
+ margin: 0;
259
+ padding: 0;
260
+ height: 100vh;
261
+ overflow-y: auto;
262
+ background-color: #2a2a2a;
263
+ color: #ffffff;
264
+ }}
265
+ .page-title {{
266
+ padding: 10px 20px;
267
+ margin: 0;
268
+ background-color: #333;
269
+ color: white;
270
+ position: sticky;
271
+ top: 0;
272
+ z-index: 10;
273
+ }}
274
+ .content-container {{
275
+ padding: 20px;
276
+ }}
277
+ .image-container {{
278
+ text-align: center;
279
+ margin: 20px 0;
280
+ }}
281
+ img {{
282
+ max-width: 100%;
283
+ height: auto;
284
+ border: 1px solid #444;
285
+ }}
286
+ table {{
287
+ border-collapse: collapse;
288
+ width: 100%;
289
+ margin: 20px 0;
290
+ background-color: #333;
291
+ }}
292
+ td, th {{
293
+ border: 1px solid #555;
294
+ padding: 8px;
295
+ color: #fff;
296
+ }}
297
+ h1, h2, h3, h4, p, span, div {{
298
+ color: #fff;
299
+ }}
300
+ .media-item {{
301
+ margin-bottom: 30px;
302
+ padding-bottom: 20px;
303
+ border-bottom: 1px solid #444;
304
+ }}
305
+ .media-item-heading {{
306
+ background-color: #444;
307
+ padding: 5px 10px;
308
+ margin-bottom: 10px;
309
+ font-weight: bold;
310
+ border-left: 3px solid #E67E22;
311
+ color: #fff;
312
+ }}
313
+ .page-text {{
314
+ margin-bottom: 30px;
315
+ border-bottom: 1px solid #444;
316
+ padding-bottom: 20px;
317
+ }}
318
+ /* ์Šคํฌ๋กค๋ฐ” ์Šคํƒ€์ผ */
319
+ ::-webkit-scrollbar {{
320
+ width: 8px;
321
+ }}
322
+ ::-webkit-scrollbar-track {{
323
+ background: #333;
324
+ }}
325
+ ::-webkit-scrollbar-thumb {{
326
+ background: #666;
327
+ border-radius: 4px;
328
+ }}
329
+ ::-webkit-scrollbar-thumb:hover {{
330
+ background: #777;
331
+ }}
332
+ /* ๋งํฌ ์Šคํƒ€์ผ */
333
+ a {{
334
+ color: #3498db;
335
+ text-decoration: none;
336
+ }}
337
+ a:hover {{
338
+ text-decoration: underline;
339
+ }}
340
+ {css_additional}
341
+ </style>
342
+ </head>
343
+ <body>
344
+ <h1 class="page-title">{title}</h1>
345
+ <div class="content-container">
346
+ {content}
347
+ </div>
348
+ </body>
349
+ </html>"""
350
+
351
+ def convert(self):
352
+ """
353
+ PDF๋ฅผ ํ…์ŠคํŠธ HTML๊ณผ ๋ฏธ๋””์–ด HTML๋กœ ๋ถ„๋ฆฌํ•˜์—ฌ ๋ณ€ํ™˜
354
+
355
+ Returns:
356
+ tuple: ํ…์ŠคํŠธ HTML ๊ฒฝ๋กœ, ๋ฏธ๋””์–ด HTML ๊ฒฝ๋กœ
357
+ """
358
+ # ํ…์ŠคํŠธ ์ปฌ๋Ÿผ๊ณผ ๋ฏธ๋””์–ด ์ปฌ๋Ÿผ์„ ์œ„ํ•œ ์ปจํ…์ธ  ์ค€๋น„
359
+ text_content = []
360
+ media_content = []
361
+ media_order = 0 # ๋ฏธ๋””์–ด ์•„์ดํ…œ ์ˆœ์„œ
362
+
363
+ # ๊ฐ ํŽ˜์ด์ง€ ์ฒ˜๋ฆฌ
364
+ for page_num, page in enumerate(self.doc):
365
+ # ํ…์ŠคํŠธ ์ถ”์ถœ
366
+ text_html = self._extract_text_with_structure(page)
367
+ text_content.append(f"\n<div class='page-text' id='page-text-{page_num+1}'>\n")
368
+ text_content.append(f"<h3>ํŽ˜์ด์ง€ {page_num+1}</h3>")
369
+ text_content.append(text_html)
370
+ text_content.append("\n</div>\n")
371
+
372
+ # ํ‘œ ์ถ”์ถœ
373
+ tables = self._extract_tables(page)
374
+ for table_idx, table in enumerate(tables):
375
+ media_order += 1
376
+ media_content.append(f"""
377
+ <div class="media-item" id="table-{page_num+1}-{table_idx+1}" data-page="{page_num+1}">
378
+ <div class="media-item-heading">ํ‘œ {media_order} - ํŽ˜์ด์ง€ {page_num+1}</div>
379
+ {table}
380
+ </div>
381
+ """)
382
+
383
+ # ์ด๋ฏธ์ง€ ์ถ”์ถœ
384
+ images = self._extract_images(page, page_num)
385
+ for img_idx, img_tag in enumerate(images):
386
+ media_order += 1
387
+ media_content.append(f"""
388
+ <div class="media-item" id="image-{page_num+1}-{img_idx+1}" data-page="{page_num+1}">
389
+ <div class="media-item-heading">์ด๋ฏธ์ง€ {media_order} - ํŽ˜์ด์ง€ {page_num+1}</div>
390
+ {img_tag}
391
+ </div>
392
+ """)
393
+
394
+ # ํ…์ŠคํŠธ HTML ์ƒ์„ฑ
395
+ text_html_content = self._create_html_template(
396
+ f"{self.pdf_name} - ํ…์ŠคํŠธ",
397
+ "\n".join(text_content)
398
+ )
399
+
400
+ # ๋ฏธ๋””์–ด HTML ์ƒ์„ฑ
401
+ media_html_content = self._create_html_template(
402
+ f"{self.pdf_name} - ํ‘œ ๋ฐ ์ด๋ฏธ์ง€",
403
+ "\n".join(media_content)
404
+ )
405
+
406
+ # ํ…์ŠคํŠธ HTML ํŒŒ์ผ ์ €์žฅ
407
+ text_output_path = self.output_dir / "text.html"
408
+ with open(text_output_path, "w", encoding="utf-8") as html_file:
409
+ html_file.write(text_html_content)
410
+
411
+ # ๋ฏธ๋””์–ด HTML ํŒŒ์ผ ์ €์žฅ
412
+ media_output_path = self.output_dir / "media.html"
413
+ with open(media_output_path, "w", encoding="utf-8") as html_file:
414
+ html_file.write(media_html_content)
415
+
416
+ # ๊ฒฐ๊ณผ ์ €์žฅ
417
+ self.text_html_content = text_html_content
418
+ self.media_html_content = media_html_content
419
+
420
+ return str(text_output_path), str(media_output_path)
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "pdf2html"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["jrpark <jrpark@petobio.com>"]
6
+ readme = "README.md"
7
+
8
+ [tool.poetry.dependencies]
9
+ python = "^3.13"
10
+ pymupdf = "^1.25.3"
11
+ beautifulsoup4 = "^4.13.3"
12
+ langchain = "^0.3.20"
13
+ gradio = "^5.20.1"
14
+ gradio-pdf = "^0.0.22"
15
+
16
+
17
+ [tool.poetry.group.dev.dependencies]
18
+ ipykernel = "^6.29.5"
19
+
20
+ [build-system]
21
+ requires = ["poetry-core"]
22
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ PyMuPDF==1.25.3
2
+ beautifulsoup4==4.13.3
3
+ langchain==0.3.20
4
+ gradio==5.20.1
5
+ gradio-pdf==0.0.22
static/page11_img1.png ADDED

Git LFS Details

  • SHA256: 6016fd31128120b19bcdfbc2d609d2db6acb94012398fc051cab2a1bdcc5b381
  • Pointer size: 131 Bytes
  • Size of remote file: 144 kB
static/page16_img1.png ADDED

Git LFS Details

  • SHA256: a5ce230c19992f5290dae4f8f9a02d8e1e0a106cf45cb7d4fb4c25cf7c2b5adc
  • Pointer size: 131 Bytes
  • Size of remote file: 287 kB
static/page2_img1.png ADDED

Git LFS Details

  • SHA256: 18fe50f4522c2760b6e92d6d426807c9084c2ee7989ff63fcca6c3ab8dae9c05
  • Pointer size: 131 Bytes
  • Size of remote file: 370 kB
static/page3_img1.png ADDED

Git LFS Details

  • SHA256: d8534ecc94f8a147c69468e6e1697213014997e64836190f4689372e502d349d
  • Pointer size: 131 Bytes
  • Size of remote file: 226 kB
static/page4_img1.png ADDED

Git LFS Details

  • SHA256: 2100680f5528e544303f001b8850e0cd31b038aa39fe9daa6992fa363b4ffa56
  • Pointer size: 131 Bytes
  • Size of remote file: 181 kB
static/page7_img1.png ADDED

Git LFS Details

  • SHA256: 26c8448c9d925ab42bf092ad2d6bcc0177525119006d1e21dafe13d2879d36ba
  • Pointer size: 131 Bytes
  • Size of remote file: 138 kB
test.ipynb ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "```mermaid\n",
8
+ "graph TD\n",
9
+ " A[LLM in Veterinary] --> B[ํ”„๋กฌํ”„ํŠธ ์—”์ง€๋‹ˆ์–ด๋ง]\n",
10
+ " A --> C[ํŒŒ์ธํŠœ๋‹]\n",
11
+ " A --> D[ํ‰๊ฐ€ ๋ฐฉ๋ฒ•]\n",
12
+ " A --> E[์‹ค์ฆ ์‚ฌ๋ก€]\n",
13
+ " A --> F[๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ]\n",
14
+ " \n",
15
+ " B --> B1(Structured Prompt)\n",
16
+ " B --> B2(Contextual Guidance)\n",
17
+ " B --> B3(Chain-of-Thought)\n",
18
+ " \n",
19
+ " C --> C1(Alpaca-7B)\n",
20
+ " C --> C2(Data-efficient)\n",
21
+ " C --> C3(SNOMED-CT)\n",
22
+ " \n",
23
+ " D --> D1(MM-Vet)\n",
24
+ " D --> D2(Stratified F1)\n",
25
+ " D --> D3(Exact Match)\n",
26
+ " \n",
27
+ " E --> E1(Diagnosis Coding)\n",
28
+ " E --> E2(Educational GPT)\n",
29
+ " E --> E3(Epidemiological Modeling)\n",
30
+ " \n",
31
+ " F --> F1(VLMs)\n",
32
+ " F --> F2(PyMuPDF4LLM)\n",
33
+ " F --> F3(Medical Imaging)\n",
34
+ "```"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "markdown",
39
+ "metadata": {},
40
+ "source": [
41
+ "```mermaid\n",
42
+ "mindmap\n",
43
+ " root((์ˆ˜์˜ํ•™ ๋ถ„์•ผ\n",
44
+ " LLM ์—ฐ๊ตฌ))\n",
45
+ " ์ž„์ƒ ์ง„๋‹จ ์ง€์›\n",
46
+ " ์ฆ์ƒ ๊ธฐ๋ฐ˜ ์งˆ๋ณ‘ ์˜ˆ์ธก\n",
47
+ " ๊ณ ์–‘์ด ํ–‰๋™ ๋ถ„์„\n",
48
+ " ๊ฐœ ํ”ผ๋ถ€ ์งˆํ™˜ ์ง„๋‹จ\n",
49
+ " ์˜๋ฃŒ ์˜์ƒ ๋ถ„์„\n",
50
+ " ๋ฐฉ์‚ฌ์„  ์˜์ƒ ํ•ด์„\n",
51
+ " ํ”ผ๋ถ€ ๋ณ‘๋ณ€ ์ธ์‹\n",
52
+ " ์ž๋™ํ™”๋œ ์ง„๋‹จ ์‹œ์Šคํ…œ\n",
53
+ " ๋‹ค์ค‘ ์ฆ์ƒ ํ†ตํ•ฉ ๋ถ„์„\n",
54
+ " ์˜๋ฃŒ ๋ฐ์ดํ„ฐ ๋ถ„์„\n",
55
+ " ์ „์ž ์˜๋ฌด ๊ธฐ๋ก ๋ถ„์„\n",
56
+ " ์น˜๋ฃŒ ํŒจํ„ด ๋ฐœ๊ฒฌ\n",
57
+ " ์น˜๋ฃŒ ํšจ๊ณผ ์˜ˆ์ธก\n",
58
+ " ์—ญํ•™ ๋ฐ์ดํ„ฐ ๋งˆ์ด๋‹\n",
59
+ " ์งˆ๋ณ‘ ๋ฐœ์ƒ ํŒจํ„ด\n",
60
+ " ์ง€์—ญ ์œ ํ–‰ ์˜ˆ์ธก\n",
61
+ " ์ƒ์ฒด์‹ ํ˜ธ ํ•ด์„\n",
62
+ " ์ง€์†์  ๋ชจ๋‹ˆํ„ฐ๋ง ์‹œ์Šคํ…œ\n",
63
+ " ๋งž์ถคํ˜• ์น˜๋ฃŒ ๊ณ„ํš\n",
64
+ " ์•ฝ๋ฌผ ์ฒ˜๋ฐฉ ์ตœ์ ํ™”\n",
65
+ " ์ข… ํŠน์ด์  ์šฉ๋Ÿ‰ ์กฐ์ ˆ\n",
66
+ " ์•ฝ๋ฌผ ์ƒํ˜ธ์ž‘์šฉ ์˜ˆ์ธก\n",
67
+ " ์ˆ˜์ˆ  ๊ณ„ํš ์ง€์›\n",
68
+ " 3D ๋ชจ๋ธ๋ง ํ†ตํ•ฉ\n",
69
+ " ์žฌํ™œ ํ”„๋กœ๊ทธ๋žจ ์„ค๊ณ„\n",
70
+ " ๋™๋ฌผ ๋งž์ถคํ˜• ์šด๋™ ๊ณ„ํš\n",
71
+ " ๊ต์œก ๋ฐ ํ›ˆ๋ จ\n",
72
+ " ์ˆ˜์˜ํ•™ ํ•™์ƒ ๊ต์œก\n",
73
+ " ๊ฐ€์ƒ ์‚ฌ๋ก€ ์‹œ๋ฎฌ๋ ˆ์ด์…˜\n",
74
+ " ๋Œ€ํ™”ํ˜• ํ•™์Šต ์‹œ์Šคํ…œ\n",
75
+ " ์ˆ˜์˜์‚ฌ ์ง€์† ๊ต์œก\n",
76
+ " ์ตœ์‹  ์—ฐ๊ตฌ ์š”์•ฝ ์ƒ์„ฑ\n",
77
+ " ์ง„๋‹จ ์ง€์นจ ์—…๋ฐ์ดํŠธ\n",
78
+ " ํ”„๋กฌํ”„ํŠธ ์—”์ง€๋‹ˆ์–ด๋ง\n",
79
+ " ์ˆ˜์˜ํ•™ ํŠนํ™” ํ”„๋กฌํ”„ํŠธ\n",
80
+ " ํ•ด๋ถ€ํ•™์  ์ •ํ™•์„ฑ ํ–ฅ์ƒ\n",
81
+ " ์ข… ํŠน์ด์  ์ง€์‹ ํ†ตํ•ฉ\n",
82
+ " ๋‹ค๋‹จ๊ณ„ ์ถ”๋ก  ํ”„๋กฌํ”„ํŠธ\n",
83
+ " ๋ณต์žกํ•œ ์งˆ๋ณ‘ ์ง„๋‹จ\n",
84
+ " ๋ชจ๋ธ ํ‰๊ฐ€ ๋ฐ ๊ฒ€์ฆ\n",
85
+ " ์ž„์ƒ ์ •ํ™•๋„ ํ‰๊ฐ€\n",
86
+ " ์ „๋ฌธ๊ฐ€ ๊ฒ€์ฆ ๋ฐฉ๋ฒ•๋ก \n",
87
+ " ํ™˜์ž ๊ฒฐ๊ณผ ์ถ”์  ์—ฐ๊ตฌ\n",
88
+ " ์œค๋ฆฌ์  ๊ณ ๋ ค์‚ฌํ•ญ\n",
89
+ " ์˜์‚ฌ๊ฒฐ์ • ํˆฌ๋ช…์„ฑ\n",
90
+ " ์˜ค์ง„ ์œ„ํ—˜ ๊ด€๋ฆฌ\n",
91
+ " ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ ์‘์šฉ\n",
92
+ " ์ด๋ฏธ์ง€-ํ…์ŠคํŠธ ํ†ตํ•ฉ ๋ถ„์„\n",
93
+ " ๋ฐฉ์‚ฌ์„  ์˜์ƒ + ์ž„์ƒ ๊ธฐ๋ก\n",
94
+ " ํ˜„๋ฏธ๊ฒฝ ์˜์ƒ + ํ˜ˆ์•ก ๊ฒ€์‚ฌ\n",
95
+ " ์˜ค๋””์˜ค ๊ธฐ๋ฐ˜ ์ง„๋‹จ\n",
96
+ " ์‹ฌ์žฅ ๋ฐ ํ ์†Œ๋ฆฌ ๋ถ„์„\n",
97
+ " ๋™๋ฌผ ๋ฐœ์„ฑ ํ•ด์„\n",
98
+ " ํ•™์ œ๊ฐ„ ์—ฐ๊ตฌ\n",
99
+ " ์›ํ—ฌ์Šค ์ ‘๊ทผ๋ฒ•\n",
100
+ " ์ธ์ˆ˜๊ณตํ†ต ์งˆ๋ณ‘ ๊ฐ์‹œ\n",
101
+ " ํ™˜๊ฒฝ-๋™๋ฌผ-์ธ๊ฐ„ ์ƒํ˜ธ์ž‘์šฉ\n",
102
+ " ์•ผ์ƒ๋™๋ฌผ ๋ณด์ „ ์‘์šฉ\n",
103
+ " ํ‘œ๋ณธ ๋ชจ๋‹ˆํ„ฐ๋ง ์ž๋™ํ™”\n",
104
+ " ๋ฉธ์ข…์œ„๊ธฐ์ข… ๋ณดํ˜ธ ์ „๋žต\n",
105
+ "```"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": null,
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": []
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": null,
118
+ "metadata": {},
119
+ "outputs": [],
120
+ "source": []
121
+ }
122
+ ],
123
+ "metadata": {
124
+ "kernelspec": {
125
+ "display_name": ".venv",
126
+ "language": "python",
127
+ "name": "python3"
128
+ },
129
+ "language_info": {
130
+ "name": "python",
131
+ "version": "3.13.1"
132
+ }
133
+ },
134
+ "nbformat": 4,
135
+ "nbformat_minor": 2
136
+ }
web_interface.py ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from pathlib import Path
4
+ import base64
5
+
6
+ # PDF to HTML ๋ณ€ํ™˜๊ธฐ ํด๋ž˜์Šค ์ž„ํฌํŠธ - ์ˆ˜์ •๋œ ๋ฒ„์ „ ์‚ฌ์šฉ
7
+ from convert import PDFToHTMLConverter
8
+
9
+
10
def convert_pdf_to_html(pdf_file):
    """Convert a PDF into two HTML fragments: text content and media content.

    The PDF bytes are written to ``.temp/temp_input_pdf/current.pdf`` under the
    current working directory and handed to ``PDFToHTMLConverter``. Every file
    found in ``.temp/temp_output_html/images`` is then inlined into the media
    HTML as a base64 ``data:`` URL wherever the HTML references it as
    ``src="images/<file name>"``.

    Args:
        pdf_file: One of
            * raw PDF bytes,
            * a ``str`` / ``os.PathLike`` path to a PDF file,
            * an object exposing the file path as ``.name`` (e.g. an uploaded
              file object).

    Returns:
        tuple[str, str]: ``(text_html, media_html)``, each wrapped in a
        scrollable ``<div>``. If anything fails, both items are the same
        HTML error fragment (message and traceback, HTML-escaped); no
        exception propagates to the caller.
    """
    try:
        # All intermediate files live under ./.temp of the working directory.
        temp_dir = Path.cwd() / ".temp"

        # Normalise the supported input kinds to raw bytes. Paths are checked
        # first because pathlib objects also expose ``.name``, which is only
        # the final path component and not something that can be opened.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_data = Path(pdf_file).read_bytes()
        elif hasattr(pdf_file, "name"):
            with open(pdf_file.name, "rb") as f:
                pdf_data = f.read()
        else:
            pdf_data = pdf_file

        # The input is always stored under the same fixed file name.
        # NOTE(review): concurrent conversions would overwrite each other's
        # input file; confirm whether the app is single-user only.
        pdf_input_dir = temp_dir / "temp_input_pdf"
        pdf_input_dir.mkdir(exist_ok=True, parents=True)
        pdf_path = pdf_input_dir / "current.pdf"

        with open(pdf_path, "wb") as f:
            f.write(pdf_data)

        print(f"PDF ์ €์žฅ ์™„๋ฃŒ: {pdf_path}")

        # The converter yields the paths of two HTML files: text and media.
        converter = PDFToHTMLConverter(str(pdf_path))
        text_html_path, media_html_path = converter.convert()

        print(f"HTML ๋ณ€ํ™˜ ์™„๋ฃŒ: {text_html_path}, {media_html_path}")

        with open(text_html_path, "r", encoding="utf-8") as f:
            text_html_content = f.read()

        with open(media_html_path, "r", encoding="utf-8") as f:
            media_html_content = f.read()

        # Inline images so the fragment renders without serving static files.
        # NOTE(review): presumably this is where the converter writes its
        # extracted images; confirm against PDFToHTMLConverter.
        img_dir_path = temp_dir / "temp_output_html" / "images"
        if img_dir_path.exists():
            print(f"์ด๋ฏธ์ง€ ๋””๋ ‰ํ† ๋ฆฌ ํ™•์ธ: {img_dir_path}")
            for img_file in img_dir_path.glob("*.*"):
                # "*.*" also matches directories whose name contains a dot.
                if not img_file.is_file():
                    continue
                try:
                    rel_path = f"images/{img_file.name}"
                    print(f"์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ ์ค‘: {img_file}")

                    with open(img_file, "rb") as f:
                        encoded_string = base64.b64encode(f.read()).decode("utf-8")

                    # Map the file extension to a MIME type; unknown
                    # extensions fall back to PNG.
                    ext = img_file.suffix.lower()[1:]  # ".png" -> "png"
                    mime_type = {
                        "png": "image/png",
                        "jpg": "image/jpeg",
                        "jpeg": "image/jpeg",
                        "gif": "image/gif",
                        "svg": "image/svg+xml",
                    }.get(ext, "image/png")

                    data_url = f"data:{mime_type};base64,{encoded_string}"

                    # Only double-quoted src attributes with exactly this
                    # relative path are rewritten.
                    original_pattern = f'src="{rel_path}"'
                    replacement = f'src="{data_url}"'

                    if original_pattern in media_html_content:
                        media_html_content = media_html_content.replace(
                            original_pattern, replacement
                        )
                        print(f"์ด๋ฏธ์ง€ {img_file.name} Base64 ์ธ์ฝ”๋”ฉ ์™„๋ฃŒ")
                    else:
                        print(
                            f"๊ฒฝ๊ณ : ์ด๋ฏธ์ง€ ๊ฒฝ๋กœ '{rel_path}'๋ฅผ HTML์—์„œ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค"
                        )

                except Exception as e:
                    # Best effort: one unreadable image must not abort the
                    # whole conversion.
                    print(f"์ด๋ฏธ์ง€ {img_file.name} ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜: {str(e)}")
        else:
            print(f"์ด๋ฏธ์ง€ ๋””๋ ‰ํ† ๋ฆฌ๊ฐ€ ์กด์žฌํ•˜์ง€ ์•Š์Œ: {img_dir_path}")

        # Wrap each fragment in a fixed-height, scrollable dark container.
        text_html_with_style = f"""
        <div style="width: 100%; height: 800px; overflow-y: auto; border: 1px solid #444; background-color: #2a2a2a;">
        {text_html_content}
        </div>
        """

        media_html_with_style = f"""
        <div style="width: 100%; height: 800px; overflow-y: auto; border: 1px solid #444; background-color: #2a2a2a;">
        {media_html_content}
        </div>
        """

        print("HTML ๋‚ด์šฉ ์ค€๋น„ ์™„๋ฃŒ")

        return text_html_with_style, media_html_with_style

    except Exception as e:
        import html
        import traceback

        error_details = traceback.format_exc()
        print(f"์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}\n{error_details}")
        # Escape before embedding: exception text and tracebacks routinely
        # contain "<", ">" and "&" (e.g. "<module>", file names), which would
        # otherwise be interpreted as markup and vanish from the page.
        error_html = (
            f"<h1>์˜ค๋ฅ˜ ๋ฐœ์ƒ</h1><p>{html.escape(str(e))}</p>"
            f"<pre>{html.escape(error_details)}</pre>"
        )
        return error_html, error_html
118
+
119
+
120
def launch_web_interface():
    """Build the Gradio UI for the PDF-to-HTML converter and launch it.

    The page consists of a header, an upload row (file input, convert button
    and a read-only status box), a two-column result area that stays hidden
    until a conversion has run, and a footer. Clicking the convert button
    passes the uploaded file to ``convert_pdf_to_html`` and shows the two
    returned HTML fragments side by side.

    ``demo.launch`` is called with ``share=False, inbrowser=True,
    show_api=False``.
    """
    # Dark-theme stylesheet handed to gr.Blocks.
    css = """
    /* ์ „์ฒด ๋ ˆ์ด์•„์›ƒ */
    body, .gradio-container {
        margin: 0 !important;
        padding: 0 !important;
        width: 100% !important;
        max-width: none !important;
        background-color: #1f1f1f;
    }

    /* ํ—ค๋” ์˜์—ญ */
    .header-area {
        background-color: #2a2a2a;
        padding: 1rem;
        border-bottom: 1px solid #444;
        margin-bottom: 1rem;
    }

    /* ์—…๋กœ๋“œ ์˜์—ญ */
    .upload-area {
        background-color: #2a2a2a;
        padding: 1rem;
        border-radius: 5px;
        margin-bottom: 1rem;
    }

    /* HTML ๋ทฐ์–ด ์ปจํ…Œ์ด๋„ˆ */
    .html-columns {
        display: flex;
        gap: 20px;
    }

    .html-column {
        flex: 1;
        min-width: 0;
    }

    /* HTML ๋ทฐ์–ด */
    .html-display {
        min-height: 800px !important;
        width: 100% !important;
        background-color: #2a2a2a !important;
    }

    /* HTML ๋‚ด์šฉ์˜ ํ…์ŠคํŠธ ์ƒ‰์ƒ */
    .html-display * {
        color: #ffffff !important;
    }

    /* HTML ๋‚ด์˜ ํ‘œ ์Šคํƒ€์ผ */
    .html-display table {
        background-color: #333 !important;
        border: 1px solid #555 !important;
    }

    .html-display td,
    .html-display th {
        border: 1px solid #555 !important;
        color: #fff !important;
    }

    /* ๋ฒ„ํŠผ ์Šคํƒ€์ผ */
    .convert-button {
        background-color: #E67E22 !important;
        border: none !important;
    }

    /* ํƒ€์ดํ‹€ ํ…์ŠคํŠธ */
    .title-text {
        color: white !important;
        margin: 0 !important;
        padding: 0 !important;
    }

    /* ์„ค๋ช… ํ…์ŠคํŠธ */
    .description-text {
        color: #aaa !important;
        margin-top: 0.5rem !important;
    }

    /* ์ปฌ๋Ÿผ ์ œ๋ชฉ */
    .column-title {
        color: white !important;
        margin-bottom: 0.5rem !important;
    }

    /* ํ‘ธํ„ฐ */
    .footer-area {
        margin-top: 2rem;
        text-align: center;
        color: #888;
        padding: 1rem;
    }
    """

    with gr.Blocks(css=css, theme=gr.themes.Default()) as demo:
        # Header
        with gr.Column(elem_classes="header-area"):
            gr.Markdown("# PDF to HTML ๋ณ€ํ™˜๊ธฐ", elem_classes="title-text")
            gr.Markdown(
                "PDF ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•˜์—ฌ ํ…์ŠคํŠธ์™€ ๋ฏธ๋””์–ด๋กœ ๋ถ„๋ฆฌ๋œ HTML์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.",
                elem_classes="description-text",
            )

        # Upload controls
        with gr.Column(elem_classes="upload-area"):
            with gr.Row():
                # type="binary" delivers the uploaded file as raw bytes.
                pdf_input = gr.File(
                    label="PDF ํŒŒ์ผ ์—…๋กœ๋“œ", type="binary", elem_id="pdf-upload"
                )
                convert_btn = gr.Button(
                    "๋ณ€ํ™˜ํ•˜๊ธฐ", variant="primary", elem_classes="convert-button"
                )

            # Read-only status line
            status_output = gr.Textbox(
                label="์ƒํƒœ", value="๋Œ€๊ธฐ ์ค‘...", interactive=False
            )

        # Result area: two columns, hidden until a conversion succeeds.
        with gr.Column(visible=False) as html_output_area:
            with gr.Row(elem_classes="html-columns"):
                # Left column: text HTML
                with gr.Column(elem_classes="html-column"):
                    gr.Markdown("### ํ…์ŠคํŠธ ๋‚ด์šฉ", elem_classes="column-title")
                    text_html_viewer = gr.HTML(
                        label="ํ…์ŠคํŠธ HTML",
                        elem_id="text-html-viewer",
                        elem_classes="html-display",
                    )

                # Right column: media HTML (tables and images)
                with gr.Column(elem_classes="html-column"):
                    gr.Markdown("### ํ‘œ ๋ฐ ์ด๋ฏธ์ง€", elem_classes="column-title")
                    media_html_viewer = gr.HTML(
                        label="๋ฏธ๋””์–ด HTML",
                        elem_id="media-html-viewer",
                        elem_classes="html-display",
                    )

        # Footer
        with gr.Column(elem_classes="footer-area"):
            gr.Markdown("ยฉ 2025 pdf2html")

        def process_conversion(pdf_file):
            """Click handler for the convert button.

            Returns a 4-tuple matching the ``outputs`` list of the click
            event: (visibility update for the result area, text HTML,
            media HTML, status message).
            """
            if pdf_file is None:
                return (
                    gr.update(visible=False),
                    "<h1 style='color:white;'>PDF ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด์ฃผ์„ธ์š”</h1>",
                    "<h1 style='color:white;'>PDF ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด์ฃผ์„ธ์š”</h1>",
                    "PDF ํŒŒ์ผ์ด ์—…๋กœ๋“œ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.",
                )

            try:
                # NOTE(review): convert_pdf_to_html catches its own exceptions
                # and returns an error fragment instead of raising, so this
                # success path (and its status text) is also taken when the
                # conversion failed. Distinguishing the two needs an explicit
                # signal from the converter function.
                text_html, media_html = convert_pdf_to_html(pdf_file)

                # Debug output: size of the generated fragments.
                print(f"ํ…์ŠคํŠธ HTML ๊ธธ์ด: {len(text_html)} ๋ฐ”์ดํŠธ")
                print(f"๋ฏธ๋””์–ด HTML ๊ธธ์ด: {len(media_html)} ๋ฐ”์ดํŠธ")

                return gr.update(visible=True), text_html, media_html, "๋ณ€ํ™˜ ์™„๋ฃŒ!"
            except Exception as e:
                import html
                import traceback

                error_details = traceback.format_exc()
                print(f"์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜: {str(e)}\n{error_details}")
                # Escape the message so "<", ">" and "&" in it are shown
                # literally instead of being parsed as markup.
                error_html = (
                    "<h1 style='color:red;'>์˜ค๋ฅ˜ ๋ฐœ์ƒ</h1>"
                    f"<p style='color:red;'>{html.escape(str(e))}</p>"
                )
                return (
                    gr.update(visible=False),
                    error_html,
                    error_html,
                    f"์˜ค๋ฅ˜: {str(e)}",
                )

        convert_btn.click(
            fn=process_conversion,
            inputs=pdf_input,
            outputs=[
                html_output_area,
                text_html_viewer,
                media_html_viewer,
                status_output,
            ],
        )

        # Front-end layout/colour fixes. Gradio evaluates the ``js`` string as
        # ONE function expression and calls it, so everything (helpers, timers,
        # observer) has to live inside a single function; a declaration
        # followed by further top-level statements does not parse.
        demo.load(
            js="""
            function setupLayoutFixes() {
                function fixLayout() {
                    // Give both result columns equal width.
                    const htmlColumns = document.querySelector('.html-columns');
                    if (htmlColumns) {
                        const columns = htmlColumns.querySelectorAll('.html-column');
                        columns.forEach(column => {
                            column.style.flex = '1';
                            column.style.minWidth = '0';
                        });
                    }

                    // Force light text inside both viewers.
                    const textViewer = document.getElementById('text-html-viewer');
                    const mediaViewer = document.getElementById('media-html-viewer');

                    function forceTextColor(element) {
                        if (!element) return;

                        // Content rendered inside iframes, if any.
                        try {
                            const iframes = element.querySelectorAll('iframe');
                            iframes.forEach(iframe => {
                                if (iframe.contentDocument) {
                                    const allElements = iframe.contentDocument.querySelectorAll('*');
                                    allElements.forEach(el => {
                                        if (el.tagName !== 'IMG') {
                                            el.style.color = '#ffffff';
                                        }
                                    });

                                    const body = iframe.contentDocument.body;
                                    if (body) {
                                        body.style.backgroundColor = '#2a2a2a';
                                    }
                                }
                            });
                        } catch (e) {
                            console.error('iframe ์ ‘๊ทผ ์ค‘ ์˜ค๋ฅ˜:', e);
                        }

                        // Content rendered directly in the page.
                        const allTextElements = element.querySelectorAll('p, h1, h2, h3, h4, h5, h6, span, div, a, li, td, th');
                        allTextElements.forEach(el => {
                            el.style.color = '#ffffff';
                        });
                    }

                    forceTextColor(textViewer);
                    forceTextColor(mediaViewer);

                    // Log the load state of every image in the media viewer.
                    if (mediaViewer) {
                        const images = mediaViewer.querySelectorAll('img');
                        console.log(`๋ฏธ๋””์–ด ๋ทฐ์–ด์—์„œ ์ด๋ฏธ์ง€ ${images.length}๊ฐœ ๋ฐœ๊ฒฌ`);

                        images.forEach((img, index) => {
                            console.log(`์ด๋ฏธ์ง€ ${index + 1} ๋กœ๋“œ ์ƒํƒœ: ${img.complete ? '์™„๋ฃŒ' : '๋กœ๋”ฉ ์ค‘'}`);
                            if (img.complete && img.naturalWidth === 0) {
                                console.log(`์ด๋ฏธ์ง€ ${index + 1} ๋กœ๋“œ ์‹คํŒจ`);
                            }
                        });
                    }
                }

                // Coalesce bursts of DOM mutations into one fixLayout run
                // 500 ms later instead of one timer per mutation batch.
                let pendingFix = null;
                function scheduleFix() {
                    if (pendingFix !== null) return;
                    pendingFix = setTimeout(() => {
                        pendingFix = null;
                        fixLayout();
                    }, 500);
                }

                function start() {
                    // Re-apply a few times while the app finishes rendering.
                    setTimeout(fixLayout, 1000);
                    setTimeout(fixLayout, 3000);
                    setTimeout(fixLayout, 5000);

                    // Re-apply whenever the page content changes.
                    const observer = new MutationObserver(scheduleFix);
                    observer.observe(document.body, {
                        childList: true,
                        subtree: true,
                        attributes: true
                    });

                    // Extra stylesheet enforcing the dark viewer colours.
                    const style = document.createElement('style');
                    style.textContent = `
                        .html-display * {
                            color: #ffffff !important;
                        }
                        .html-display {
                            background-color: #2a2a2a !important;
                        }
                    `;
                    document.head.appendChild(style);
                }

                // The window 'load' event may already have fired by the time
                // this runs; in that case start right away.
                if (document.readyState === 'complete') {
                    start();
                } else {
                    window.addEventListener('load', start);
                }
            }
            """
        )

    # Start the local server and open it in the default browser.
    demo.launch(share=False, inbrowser=True, show_api=False)
421
+
422
+
423
+ if __name__ == "__main__":
424
+ launch_web_interface()