Spaces:
Running
Running
MohitGupta41 committed on
Commit ·
4b9a655
1
Parent(s): a75e6eb
Initial Commit
Browse files
- .gitattributes +4 -35
- .gitignore +3 -0
- .streamlit/config.toml +3 -0
- Dockerfile +29 -0
- Notebook/notebook.ipynb +3 -0
- Notebook/notebook2.ipynb +3 -0
- Notebook/notebook3.ipynb +3 -0
- Notebook/notebook4.ipynb +3 -0
- Notebook/notebook5.ipynb +3 -0
- Notebook/temp/ocr_fixed.pdf +3 -0
- PDF_Translate/__init__.py +3 -0
- PDF_Translate/cli.py +3 -0
- PDF_Translate/constants.py +3 -0
- PDF_Translate/hybrid.py +3 -0
- PDF_Translate/ocr.py +3 -0
- PDF_Translate/overlay.py +3 -0
- PDF_Translate/pipeline.py +3 -0
- PDF_Translate/textlayer.py +3 -0
- PDF_Translate/utils.py +3 -0
- app.py +3 -0
- assets/fonts/Hind-Regular.ttf +3 -0
- assets/fonts/Karma-Regular.ttf +3 -0
- assets/fonts/Mukta-Regular.ttf +3 -0
- assets/fonts/NotoSans-Bold.ttf +3 -0
- assets/fonts/NotoSans-Regular.ttf +3 -0
- assets/fonts/NotoSansDevanagari-Bold.ttf +3 -0
- assets/fonts/NotoSansDevanagari-Regular.ttf +3 -0
- assets/fonts/TiroDevanagariHindi-Regular.ttf +3 -0
- assets/samples/Test1.pdf +3 -0
- assets/samples/Test1_translated.pdf +3 -0
- assets/samples/Test2.pdf +3 -0
- assets/samples/Test2_translated.pdf +3 -0
- assets/samples/Test3.pdf +3 -0
- assets/samples/Test3_translated.pdf +3 -0
- requirements.txt +14 -0
.gitattributes
CHANGED
|
@@ -1,35 +1,4 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
*.py filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.ipynb filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.ttf filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
.venv
|
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[theme]
|
| 2 |
+
base="dark"
|
| 3 |
+
primaryColor="#3B82F6"
|
Dockerfile
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim
|
| 2 |
+
|
| 3 |
+
# System deps for ocrmypdf + fonts + runtime
|
| 4 |
+
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
| 5 |
+
ocrmypdf tesseract-ocr ghostscript qpdf pngquant unpaper \
|
| 6 |
+
fonts-noto fonts-noto-cjk fonts-noto-unhinted fonts-noto-color-emoji \
|
| 7 |
+
libglib2.0-0 libgl1 \
|
| 8 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 9 |
+
|
| 10 |
+
# app code
|
| 11 |
+
WORKDIR /app
|
| 12 |
+
COPY requirements.txt ./requirements.txt
|
| 13 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 14 |
+
|
| 15 |
+
COPY . .
|
| 16 |
+
|
| 17 |
+
# streamlit port on Spaces is 7860 by convention
|
| 18 |
+
ENV PORT=7860
|
| 19 |
+
EXPOSE 7860
|
| 20 |
+
|
| 21 |
+
# Ensure output dirs exist
|
| 22 |
+
RUN mkdir -p output_pdfs temp
|
| 23 |
+
|
| 24 |
+
# Streamlit config
|
| 25 |
+
ENV STREAMLIT_SERVER_PORT=7860
|
| 26 |
+
ENV STREAMLIT_SERVER_HEADLESS=true
|
| 27 |
+
ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
|
| 28 |
+
|
| 29 |
+
CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
|
Notebook/notebook.ipynb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b187df0cfb55d0517d13ac484cbb920e229d292d54f7db2114220d143667ee04
|
| 3 |
+
size 262601
|
Notebook/notebook2.ipynb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8428c01520dce3e2098c0e0328dd69d8280ceeba6b46ee8e4fbd76211e79ed5a
|
| 3 |
+
size 399743
|
Notebook/notebook3.ipynb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b778339407913f774c9a9c5d58cd2045a1523cf61dc2499d9d1c384852a64a95
|
| 3 |
+
size 270908
|
Notebook/notebook4.ipynb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:796016ae706028cd1ee126978537a6b8100f202837078b975554cdbf74be2088
|
| 3 |
+
size 45168
|
Notebook/notebook5.ipynb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f29def08df2f8a8067fd246a4f73dcce05cae4606ad609e354e048926517e6f3
|
| 3 |
+
size 149852
|
Notebook/temp/ocr_fixed.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:76985c22eabb5c76866b2a14a4affae1305380d7288b8c9a3cc6321514e56f81
|
| 3 |
+
size 8202753
|
PDF_Translate/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1386ed2d1dd2ef590be5ef2ea86cbff8456484e1a84c554fddbbb1f108fab526
|
| 3 |
+
size 24
|
PDF_Translate/cli.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:06be90dd5253873a72b614b08d5eaaff8289e59be9d33f6c6c08d6cf2bcb3b1f
|
| 3 |
+
size 6237
|
PDF_Translate/constants.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:327964f0b072424f8ac46143b56c636d65ab148a4209ba6f53f524358144ef27
|
| 3 |
+
size 1106
|
PDF_Translate/hybrid.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cf57e2185fafb85f9bb158d1c0283074826ca02c002b931d4bdabadd2aaec040
|
| 3 |
+
size 6063
|
PDF_Translate/ocr.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dceea223700e784f853e5ecca04031a0a8c32570ab9cf71687f090a47296047b
|
| 3 |
+
size 2343
|
PDF_Translate/overlay.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f9aa45d7e145ea379fa141c28efccbfaaa08e04999e27ef4b3039ef2dab4cfd0
|
| 3 |
+
size 10788
|
PDF_Translate/pipeline.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a5db8156e416cd4da5dd2e9b7dd831c90cc632ef53d20499081173c5e9443f30
|
| 3 |
+
size 18824
|
PDF_Translate/textlayer.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3970161c69a797e76c4a825a7d8414964ce6704db0ffac7d65a4e6327193e928
|
| 3 |
+
size 11925
|
PDF_Translate/utils.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:232895507f730581719776fec888d9f7f07bfccb64475340fe596835c64ca811
|
| 3 |
+
size 6267
|
app.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cb01b0a35bdfc3534a56719c037bb90afaa0ecf35974bef57e7d87514f211f6c
|
| 3 |
+
size 12251
|
assets/fonts/Hind-Regular.ttf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:01de158022f53077b52303e46de3b0ab5fb245222a7ffe25a2a57fdd9e969162
|
| 3 |
+
size 299532
|
assets/fonts/Karma-Regular.ttf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cc3457d2867b67ee956d95206dbc831e57b26975219d3d4e68d60f51f67b4b55
|
| 3 |
+
size 339812
|
assets/fonts/Mukta-Regular.ttf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2958e4af564507df2a856164df6f9978dacb03f999a4f34a0c269dc8a4de9688
|
| 3 |
+
size 432248
|
assets/fonts/NotoSans-Bold.ttf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c976e4b1b99edc88775377fcc21692ca4bfa46b6d6ca6522bfda505b28ff9d6a
|
| 3 |
+
size 575740
|
assets/fonts/NotoSans-Regular.ttf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b85c38ecea8a7cfb39c24e395a4007474fa5a4fc864f6ee33309eb4948d232d5
|
| 3 |
+
size 569208
|
assets/fonts/NotoSansDevanagari-Bold.ttf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:19cc835a064c4af63e3c20feb54f5bf51dc25ffa52c0f493a23904572af8b26e
|
| 3 |
+
size 225748
|
assets/fonts/NotoSansDevanagari-Regular.ttf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:385e78e6359a9d88a0f243d53b1209d7548361ba2194e2b9ec779bcaa7e8949d
|
| 3 |
+
size 219212
|
assets/fonts/TiroDevanagariHindi-Regular.ttf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d2abcb4d352f0bfab91632df5d9c8173882073c182e662fa731a5a738e6681d7
|
| 3 |
+
size 423224
|
assets/samples/Test1.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:38b316e937344ff9c1d536d9809f803df6c1130f8cbb3a6102d5fed8b029492c
|
| 3 |
+
size 245658
|
assets/samples/Test1_translated.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7f80f2a1e0dde378590b7f2387d8929fbe65c6e8dec8b59d6c9926a2df2030bd
|
| 3 |
+
size 448186
|
assets/samples/Test2.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:84cc71827279b3d4edfe6b3b2c4a67a01d58d3ab08cd9fe4eb256870c75fabf4
|
| 3 |
+
size 462464
|
assets/samples/Test2_translated.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c9a2d58323263978b6f53d767642cfafc50003b9a81cc670799790328a341120
|
| 3 |
+
size 494623
|
assets/samples/Test3.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b43902250834482ca3bbbeb10547773a25f595de40262a004812a69b17e405a6
|
| 3 |
+
size 1251666
|
assets/samples/Test3_translated.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:939e0785b078e4830e2847ba32d984ca31bb521d45f1ebf3a3aeea4562b77dc7
|
| 3 |
+
size 650922
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit>=1.35.0
|
| 2 |
+
pymupdf>=1.24.0
|
| 3 |
+
pillow>=10.3.0
|
| 4 |
+
# pytesseract>=0.3.10
|
| 5 |
+
|
| 6 |
+
ocrmypdf
|
| 7 |
+
googletrans
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
streamlit==1.38.0
|
| 11 |
+
pymupdf==1.24.9
|
| 12 |
+
# googletrans==4.0.0rc1
|
| 13 |
+
# Pillow==10.4.0
|
| 14 |
+
# nest_asyncio==1.6.0
|