Spaces:
Running
Running
github-actions[bot] commited on
Commit ·
8e52fc5
0
Parent(s):
Sync from GitHub
Browse files- .gitattributes +35 -0
- .github/workflows/sync-hf.yml +37 -0
- .gitignore +211 -0
- .streamlit/config.toml +3 -0
- AGENTS.md +42 -0
- Dockerfile +23 -0
- README.md +128 -0
- extractor.py +409 -0
- requirements.txt +7 -0
- src/streamlit_app.py +332 -0
- templates/corporate_tax_returns.json +31 -0
- templates/diplomas.json +51 -0
- templates/employment_letter.json +66 -0
- templates/i129_h1b_petition.json +224 -0
- templates/i_94.json +14 -0
- templates/marriage_certificate.json +70 -0
- templates/passport.json +20 -0
- templates/proof_of_in_country_status.json +69 -0
- templates/resume.json +137 -0
- templates/school_transcripts.json +55 -0
- templates/us_visa.json +21 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.github/workflows/sync-hf.yml
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Sync Hugging Face Space
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches:
|
| 6 |
+
- main
|
| 7 |
+
|
| 8 |
+
jobs:
|
| 9 |
+
sync:
|
| 10 |
+
runs-on: ubuntu-latest
|
| 11 |
+
steps:
|
| 12 |
+
- name: Checkout
|
| 13 |
+
uses: actions/checkout@v4
|
| 14 |
+
with:
|
| 15 |
+
fetch-depth: 0
|
| 16 |
+
- name: Push to Hugging Face Space
|
| 17 |
+
env:
|
| 18 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 19 |
+
HF_SPACE: pradyten/pdf-extractor
|
| 20 |
+
run: |
|
| 21 |
+
set -euo pipefail
|
| 22 |
+
if [ -z "${HF_TOKEN}" ]; then
|
| 23 |
+
echo "HF_TOKEN is not set."
|
| 24 |
+
exit 1
|
| 25 |
+
fi
|
| 26 |
+
sync_dir="$(mktemp -d)"
|
| 27 |
+
git ls-files -z | tar --null -T - -cf - | tar -xf - -C "${sync_dir}"
|
| 28 |
+
find "${sync_dir}" -type f -name "*.pdf" -delete
|
| 29 |
+
rm -rf "${sync_dir}/sample"
|
| 30 |
+
cd "${sync_dir}"
|
| 31 |
+
git init
|
| 32 |
+
git config user.name "github-actions[bot]"
|
| 33 |
+
git config user.email "github-actions[bot]@users.noreply.github.com"
|
| 34 |
+
git add .
|
| 35 |
+
git commit -m "Sync from GitHub"
|
| 36 |
+
git remote add hf "https://user:${HF_TOKEN}@huggingface.co/spaces/${HF_SPACE}"
|
| 37 |
+
git push --force hf HEAD:main
|
.gitignore
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[codz]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py.cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# UV
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
#uv.lock
|
| 102 |
+
|
| 103 |
+
# poetry
|
| 104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 106 |
+
# commonly ignored for libraries.
|
| 107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 108 |
+
#poetry.lock
|
| 109 |
+
#poetry.toml
|
| 110 |
+
|
| 111 |
+
# pdm
|
| 112 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 113 |
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
| 114 |
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
| 115 |
+
#pdm.lock
|
| 116 |
+
#pdm.toml
|
| 117 |
+
.pdm-python
|
| 118 |
+
.pdm-build/
|
| 119 |
+
|
| 120 |
+
# pixi
|
| 121 |
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
| 122 |
+
#pixi.lock
|
| 123 |
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
| 124 |
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
| 125 |
+
.pixi
|
| 126 |
+
|
| 127 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 128 |
+
__pypackages__/
|
| 129 |
+
|
| 130 |
+
# Celery stuff
|
| 131 |
+
celerybeat-schedule
|
| 132 |
+
celerybeat.pid
|
| 133 |
+
|
| 134 |
+
# SageMath parsed files
|
| 135 |
+
*.sage.py
|
| 136 |
+
|
| 137 |
+
# Environments
|
| 138 |
+
.env
|
| 139 |
+
.envrc
|
| 140 |
+
.venv
|
| 141 |
+
env/
|
| 142 |
+
venv/
|
| 143 |
+
ENV/
|
| 144 |
+
env.bak/
|
| 145 |
+
venv.bak/
|
| 146 |
+
|
| 147 |
+
# Spyder project settings
|
| 148 |
+
.spyderproject
|
| 149 |
+
.spyproject
|
| 150 |
+
|
| 151 |
+
# Rope project settings
|
| 152 |
+
.ropeproject
|
| 153 |
+
|
| 154 |
+
# mkdocs documentation
|
| 155 |
+
/site
|
| 156 |
+
|
| 157 |
+
# mypy
|
| 158 |
+
.mypy_cache/
|
| 159 |
+
.dmypy.json
|
| 160 |
+
dmypy.json
|
| 161 |
+
|
| 162 |
+
# Pyre type checker
|
| 163 |
+
.pyre/
|
| 164 |
+
|
| 165 |
+
# pytype static type analyzer
|
| 166 |
+
.pytype/
|
| 167 |
+
|
| 168 |
+
# Cython debug symbols
|
| 169 |
+
cython_debug/
|
| 170 |
+
|
| 171 |
+
# PyCharm
|
| 172 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 173 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 174 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 175 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 176 |
+
#.idea/
|
| 177 |
+
|
| 178 |
+
# Abstra
|
| 179 |
+
# Abstra is an AI-powered process automation framework.
|
| 180 |
+
# Ignore directories containing user credentials, local state, and settings.
|
| 181 |
+
# Learn more at https://abstra.io/docs
|
| 182 |
+
.abstra/
|
| 183 |
+
|
| 184 |
+
# Visual Studio Code
|
| 185 |
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
| 186 |
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
| 187 |
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
| 188 |
+
# you could uncomment the following to ignore the entire vscode folder
|
| 189 |
+
# .vscode/
|
| 190 |
+
|
| 191 |
+
# Ruff stuff:
|
| 192 |
+
.ruff_cache/
|
| 193 |
+
|
| 194 |
+
# PyPI configuration file
|
| 195 |
+
.pypirc
|
| 196 |
+
|
| 197 |
+
# Cursor
|
| 198 |
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
| 199 |
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
| 200 |
+
# refer to https://docs.cursor.com/context/ignore-files
|
| 201 |
+
.cursorignore
|
| 202 |
+
.cursorindexingignore
|
| 203 |
+
|
| 204 |
+
# Marimo
|
| 205 |
+
marimo/_static/
|
| 206 |
+
marimo/_lsp/
|
| 207 |
+
__marimo__/
|
| 208 |
+
|
| 209 |
+
*.pdf
|
| 210 |
+
|
| 211 |
+
!sample/*.pdf
|
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[server]
|
| 2 |
+
enableCORS = false
|
| 3 |
+
enableXsrfProtection = false
|
AGENTS.md
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Repository Guidelines
|
| 2 |
+
|
| 3 |
+
## Project Structure & Module Organization
|
| 4 |
+
- `extractor.py` contains PDF rendering, template selection, and OpenAI calls.
|
| 5 |
+
- `templates/` holds JSON extraction templates referenced by `TEMPLATE_REGISTRY`.
|
| 6 |
+
- `src/streamlit_app.py` is the Hugging Face Space UI entrypoint.
|
| 7 |
+
- `Dockerfile` builds the Space image (Streamlit on port 8501).
|
| 8 |
+
- `.streamlit/config.toml` contains Space-friendly Streamlit server settings.
|
| 9 |
+
- `README.md` includes Space metadata front matter and usage notes.
|
| 10 |
+
- The UI relies on filename keywords to select templates (see `TEMPLATE_REGISTRY`).
|
| 11 |
+
- Sample PDFs are fetched from the HF dataset set by `SAMPLE_DATASET_REPO`.
|
| 12 |
+
|
| 13 |
+
## Build, Test, and Development Commands
|
| 14 |
+
- Install dependencies with `python -m pip install -r requirements.txt`.
|
| 15 |
+
- Local CLI extraction prompts for a PDF path and prints JSON:
|
| 16 |
+
- `python extractor.py`
|
| 17 |
+
- Run the Space UI locally:
|
| 18 |
+
- `streamlit run src/streamlit_app.py`
|
| 19 |
+
- Quick import sanity check:
|
| 20 |
+
- `python -c "import extractor; print(extractor.DEFAULT_MODEL)"`
|
| 21 |
+
|
| 22 |
+
## Coding Style & Naming Conventions
|
| 23 |
+
- Keep 2-space indentation in `extractor.py`.
|
| 24 |
+
- Use snake_case for functions/variables, UPPER_SNAKE for constants, and add type hints for new functions.
|
| 25 |
+
- Template JSON filenames should be snake_case and registered via lowercase filename keywords in `TEMPLATE_REGISTRY`.
|
| 26 |
+
|
| 27 |
+
## Testing Guidelines
|
| 28 |
+
- No automated test suite exists yet. If adding tests, use `pytest` under `tests/`.
|
| 29 |
+
- Validate that model output matches the exact template schema and that filename keywords map to the right template.
|
| 30 |
+
|
| 31 |
+
## Commit & Pull Request Guidelines
|
| 32 |
+
- No established commit convention; use short, imperative subjects.
|
| 33 |
+
- PRs should include the document type, template files touched, example filename keyword, and any config/env changes.
|
| 34 |
+
|
| 35 |
+
## Security & Configuration Tips
|
| 36 |
+
- Set `OPENAI_API_KEY` for local runs and the Space; optionally override `EXTRACTOR_MODEL_ALIAS`.
|
| 37 |
+
- Avoid committing sensitive PDFs or output data; use redacted samples for demos.
|
| 38 |
+
|
| 39 |
+
## Automation
|
| 40 |
+
- `.github/workflows/sync-hf.yml` pushes `main` to the HF Space on each commit using `HF_TOKEN`.
|
| 41 |
+
- Treat GitHub as the source of truth; direct edits on HF may be overwritten.
|
| 42 |
+
- The workflow force-pushes a fresh snapshot to avoid blocked legacy binaries in history.
|
Dockerfile
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.13.5-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
RUN apt-get update && apt-get install -y \
|
| 6 |
+
build-essential \
|
| 7 |
+
curl \
|
| 8 |
+
git \
|
| 9 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
+
|
| 11 |
+
COPY requirements.txt ./
|
| 12 |
+
COPY src/ ./src/
|
| 13 |
+
COPY .streamlit/ ./.streamlit/
|
| 14 |
+
COPY extractor.py ./
|
| 15 |
+
COPY templates/ ./templates/
|
| 16 |
+
|
| 17 |
+
RUN pip3 install -r requirements.txt
|
| 18 |
+
|
| 19 |
+
EXPOSE 8501
|
| 20 |
+
|
| 21 |
+
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
| 22 |
+
|
| 23 |
+
ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
README.md
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Pdf Extractor
|
| 3 |
+
emoji: 🚀
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: red
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 8501
|
| 8 |
+
tags:
|
| 9 |
+
- streamlit
|
| 10 |
+
pinned: false
|
| 11 |
+
short_description: pdf_extractor
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# PDF-to-JSON Extractor with AI
|
| 15 |
+
|
| 16 |
+
Intelligent PDF document parser that extracts structured JSON data using OpenAI's GPT models and computer vision.
|
| 17 |
+
|
| 18 |
+
## 📋 Table of Contents
|
| 19 |
+
- [Overview](#overview)
|
| 20 |
+
- [Features](#features)
|
| 21 |
+
- [Technology Stack](#technology-stack)
|
| 22 |
+
- [Installation](#installation)
|
| 23 |
+
- [Usage](#usage)
|
| 24 |
+
- [Configuration](#configuration)
|
| 25 |
+
- [Author](#author)
|
| 26 |
+
|
| 27 |
+
## 🎯 Overview
|
| 28 |
+
|
| 29 |
+
This application converts PDF documents into structured JSON format using:
|
| 30 |
+
- **OpenAI GPT-4 Vision**: For intelligent content extraction
|
| 31 |
+
- **Template-based extraction**: Customizable JSON schemas for different document types
|
| 32 |
+
- **Streamlit UI**: Interactive web interface for easy PDF processing
|
| 33 |
+
- **Docker support**: Containerized deployment for production environments
|
| 34 |
+
|
| 35 |
+
Perfect for automating data extraction from resumes, invoices, forms, and other structured documents.
|
| 36 |
+
|
| 37 |
+
## ✨ Features
|
| 38 |
+
|
| 39 |
+
- **AI-Powered Extraction**: Uses GPT-4 Vision to understand document structure
|
| 40 |
+
- **Template System**: Pre-configured JSON templates for common document types
|
| 41 |
+
- **Batch Processing**: Handle multiple PDFs efficiently
|
| 42 |
+
- **Image Preview**: Visual confirmation of PDF pages before extraction
|
| 43 |
+
- **Format Validation**: Ensures extracted JSON matches defined schema
|
| 44 |
+
- **Hugging Face Spaces**: Ready for cloud deployment
|
| 45 |
+
|
| 46 |
+
## 🛠 Technology Stack
|
| 47 |
+
|
| 48 |
+
- **Python 3.9+** - Primary programming language
|
| 49 |
+
- **OpenAI API** - GPT-4 Vision for intelligent extraction
|
| 50 |
+
- **pypdfium2** - PDF rendering and image conversion
|
| 51 |
+
- **Streamlit** - Interactive web UI framework
|
| 52 |
+
- **Pillow (PIL)** - Image processing
|
| 53 |
+
- **Pandas** - Data manipulation
|
| 54 |
+
|
| 55 |
+
## 🚀 Installation
|
| 56 |
+
|
| 57 |
+
### Prerequisites
|
| 58 |
+
- Python 3.9 or higher
|
| 59 |
+
- OpenAI API key ([Get one here](https://platform.openai.com/api-keys))
|
| 60 |
+
|
| 61 |
+
### Setup
|
| 62 |
+
|
| 63 |
+
1. Clone the repository:
|
| 64 |
+
\`\`\`bash
|
| 65 |
+
git clone https://github.com/pradyten/pdf-extractor.git
|
| 66 |
+
cd pdf-extractor
|
| 67 |
+
\`\`\`
|
| 68 |
+
|
| 69 |
+
2. Install dependencies:
|
| 70 |
+
\`\`\`bash
|
| 71 |
+
pip install -r requirements.txt
|
| 72 |
+
\`\`\`
|
| 73 |
+
|
| 74 |
+
3. Configure OpenAI API key:
|
| 75 |
+
\`\`\`bash
|
| 76 |
+
export OPENAI_API_KEY='your-api-key-here'
|
| 77 |
+
\`\`\`
|
| 78 |
+
|
| 79 |
+
## 💻 Usage
|
| 80 |
+
|
| 81 |
+
### Command Line
|
| 82 |
+
\`\`\`bash
|
| 83 |
+
python extractor.py path/to/document.pdf
|
| 84 |
+
\`\`\`
|
| 85 |
+
|
| 86 |
+
### Streamlit Web UI
|
| 87 |
+
\`\`\`bash
|
| 88 |
+
streamlit run src/streamlit_app.py
|
| 89 |
+
\`\`\`
|
| 90 |
+
|
| 91 |
+
### Docker
|
| 92 |
+
\`\`\`bash
|
| 93 |
+
docker build -t pdf-extractor .
|
| 94 |
+
docker run -p 8501:8501 -e OPENAI_API_KEY='your-key' pdf-extractor
|
| 95 |
+
\`\`\`
|
| 96 |
+
|
| 97 |
+
## ⚙️ Configuration
|
| 98 |
+
|
| 99 |
+
Define custom templates in \`extractor.py\` for different document types (resumes, invoices, forms).
|
| 100 |
+
|
| 101 |
+
## 🎓 Use Cases
|
| 102 |
+
|
| 103 |
+
- **HR & Recruitment**: Batch process resume PDFs
|
| 104 |
+
- **Accounting**: Extract invoice data
|
| 105 |
+
- **Data Entry**: Automate form digitization
|
| 106 |
+
- **Document Management**: Convert scanned documents to searchable JSON
|
| 107 |
+
|
| 108 |
+
## 🔒 Security & Privacy
|
| 109 |
+
|
| 110 |
+
- Never commit API keys - use environment variables
|
| 111 |
+
- PDFs are processed in-memory, not stored
|
| 112 |
+
- Review OpenAI's data usage policies for compliance
|
| 113 |
+
|
| 114 |
+
## 👨💻 Author
|
| 115 |
+
|
| 116 |
+
**Pradyumn Tendulkar**
|
| 117 |
+
|
| 118 |
+
Data Science Graduate Student | ML Engineer
|
| 119 |
+
|
| 120 |
+
- GitHub: [@pradyten](https://github.com/pradyten)
|
| 121 |
+
- LinkedIn: [Pradyumn Tendulkar](https://www.linkedin.com/in/p-tendulkar/)
|
| 122 |
+
- Email: pktendulkar@wpi.edu
|
| 123 |
+
|
| 124 |
+
---
|
| 125 |
+
|
| 126 |
+
⭐ If you found this project helpful, please consider giving it a star!
|
| 127 |
+
|
| 128 |
+
📝 **License:** MIT
|
extractor.py
ADDED
|
@@ -0,0 +1,409 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import base64
|
| 4 |
+
import io
|
| 5 |
+
from typing import Dict, Any, List, Tuple, Optional
|
| 6 |
+
|
| 7 |
+
from openai import OpenAI
|
| 8 |
+
import pypdfium2 as pdfium
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# path to templates folder (relative to this file)
|
| 12 |
+
TEMPLATES_DIR = os.path.join(os.path.dirname(__file__), "templates")
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
TEMPLATE_REGISTRY: Dict[str, Dict[str, str]] = {
|
| 16 |
+
# keyword in PDF filename (lowercase) : { document_type, template_file }
|
| 17 |
+
|
| 18 |
+
# Immigration forms
|
| 19 |
+
"i129": {
|
| 20 |
+
"document_type": "USCIS Form I-129 H-1B Petition",
|
| 21 |
+
"template_file": "i129_h1b_petition.json",
|
| 22 |
+
},
|
| 23 |
+
"i94": {
|
| 24 |
+
"document_type": "Form I-94 Arrival/Departure Record",
|
| 25 |
+
"template_file": "i_94.json",
|
| 26 |
+
},
|
| 27 |
+
"i-94": {
|
| 28 |
+
"document_type": "Form I-94 Arrival/Departure Record",
|
| 29 |
+
"template_file": "i_94.json",
|
| 30 |
+
},
|
| 31 |
+
"i20": {
|
| 32 |
+
"document_type": "Form I-20 Certificate of Eligibility",
|
| 33 |
+
"template_file": "proof_of_in_country_status.json",
|
| 34 |
+
},
|
| 35 |
+
"i-20": {
|
| 36 |
+
"document_type": "Form I-20 Certificate of Eligibility",
|
| 37 |
+
"template_file": "proof_of_in_country_status.json",
|
| 38 |
+
},
|
| 39 |
+
|
| 40 |
+
# Identity documents
|
| 41 |
+
"passport": {
|
| 42 |
+
"document_type": "Passport",
|
| 43 |
+
"template_file": "passport.json",
|
| 44 |
+
},
|
| 45 |
+
"visa": {
|
| 46 |
+
"document_type": "US Visa",
|
| 47 |
+
"template_file": "us_visa.json",
|
| 48 |
+
},
|
| 49 |
+
|
| 50 |
+
# Education documents
|
| 51 |
+
"transcript": {
|
| 52 |
+
"document_type": "Academic Transcript",
|
| 53 |
+
"template_file": "school_transcripts.json",
|
| 54 |
+
},
|
| 55 |
+
"diploma": {
|
| 56 |
+
"document_type": "Diploma",
|
| 57 |
+
"template_file": "diplomas.json",
|
| 58 |
+
},
|
| 59 |
+
|
| 60 |
+
# Employment documents
|
| 61 |
+
"employment letter": {
|
| 62 |
+
"document_type": "Employment Letter",
|
| 63 |
+
"template_file": "employment_letter.json",
|
| 64 |
+
},
|
| 65 |
+
"offer letter": {
|
| 66 |
+
"document_type": "Employment Letter",
|
| 67 |
+
"template_file": "employment_letter.json",
|
| 68 |
+
},
|
| 69 |
+
"offer-letter": {
|
| 70 |
+
"document_type": "Employment Letter",
|
| 71 |
+
"template_file": "employment_letter.json",
|
| 72 |
+
},
|
| 73 |
+
"offer_letter": {
|
| 74 |
+
"document_type": "Employment Letter",
|
| 75 |
+
"template_file": "employment_letter.json",
|
| 76 |
+
},
|
| 77 |
+
"employment_letter": {
|
| 78 |
+
"document_type": "Employment Letter",
|
| 79 |
+
"template_file": "employment_letter.json",
|
| 80 |
+
},
|
| 81 |
+
"employment": {
|
| 82 |
+
"document_type": "Employment Letter",
|
| 83 |
+
"template_file": "employment_letter.json",
|
| 84 |
+
},
|
| 85 |
+
"resume": {
|
| 86 |
+
"document_type": "Resume/CV",
|
| 87 |
+
"template_file": "resume.json",
|
| 88 |
+
},
|
| 89 |
+
"cv": {
|
| 90 |
+
"document_type": "Resume/CV",
|
| 91 |
+
"template_file": "resume.json",
|
| 92 |
+
},
|
| 93 |
+
|
| 94 |
+
# Tax and corporate documents
|
| 95 |
+
"fein": {
|
| 96 |
+
"document_type": "Corporate Tax Returns",
|
| 97 |
+
"template_file": "corporate_tax_returns.json",
|
| 98 |
+
},
|
| 99 |
+
"cp575": {
|
| 100 |
+
"document_type": "Corporate Tax Returns",
|
| 101 |
+
"template_file": "corporate_tax_returns.json",
|
| 102 |
+
},
|
| 103 |
+
"tax": {
|
| 104 |
+
"document_type": "Corporate Tax Returns",
|
| 105 |
+
"template_file": "corporate_tax_returns.json",
|
| 106 |
+
},
|
| 107 |
+
|
| 108 |
+
# Personal documents
|
| 109 |
+
"marriage": {
|
| 110 |
+
"document_type": "Marriage Certificate",
|
| 111 |
+
"template_file": "marriage_certificate.json",
|
| 112 |
+
},
|
| 113 |
+
"marriage_certificate": {
|
| 114 |
+
"document_type": "Marriage Certificate",
|
| 115 |
+
"template_file": "marriage_certificate.json",
|
| 116 |
+
},
|
| 117 |
+
|
| 118 |
+
# Proof of status
|
| 119 |
+
"proof": {
|
| 120 |
+
"document_type": "Proof of In-Country Status",
|
| 121 |
+
"template_file": "proof_of_in_country_status.json",
|
| 122 |
+
},
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
# Logical model aliases for this extractor (OpenAI ChatGPT models).
|
| 127 |
+
ALLOWED_MODELS = [
|
| 128 |
+
"default",
|
| 129 |
+
"gpt-4.1-mini",
|
| 130 |
+
"gpt-4.1",
|
| 131 |
+
"gpt-4o-mini",
|
| 132 |
+
"gpt-4o",
|
| 133 |
+
# Legacy/dated aliases kept for compatibility.
|
| 134 |
+
"gpt-4.1-2025-04-14",
|
| 135 |
+
"gpt-4.1-mini-2025-04-14",
|
| 136 |
+
"gpt-5-2025-08-07",
|
| 137 |
+
"gpt-5-mini-2025-08-07",
|
| 138 |
+
]
|
| 139 |
+
|
| 140 |
+
DEFAULT_MODEL = os.getenv("EXTRACTOR_MODEL_ALIAS", "gpt-4.1-mini")
|
| 141 |
+
|
| 142 |
+
OPENAI_API_KEY_ENV = "OPENAI_API_KEY"
|
| 143 |
+
_openai_client: Optional[OpenAI] = None
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def load_template(template_file: str) -> Dict[str, Any]:
|
| 147 |
+
path = os.path.join(TEMPLATES_DIR, template_file)
|
| 148 |
+
if not os.path.exists(path):
|
| 149 |
+
raise FileNotFoundError(f"Template not found: {path}")
|
| 150 |
+
with open(path, "r", encoding="utf-8") as fh:
|
| 151 |
+
return json.load(fh)
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def infer_template_from_filename(filename: str) -> Tuple[str, Dict[str, Any]]:
|
| 155 |
+
"""
|
| 156 |
+
Look at the PDF file name and decide which document_type + template to use.
|
| 157 |
+
|
| 158 |
+
Example:
|
| 159 |
+
- 'I129 HALF.pdf' -> matches 'i129' -> uses i129_h1b_petition.json
|
| 160 |
+
- 'passport_rohan.pdf' -> matches 'passport' -> uses passport.json
|
| 161 |
+
- 'F1_visa_page1.pdf' -> matches 'visa' -> uses us_visa.json
|
| 162 |
+
- 'i94_record.pdf' -> matches 'i94' -> uses i_94.json
|
| 163 |
+
"""
|
| 164 |
+
basename = os.path.basename(filename).lower()
|
| 165 |
+
|
| 166 |
+
for keyword, cfg in TEMPLATE_REGISTRY.items():
|
| 167 |
+
if keyword in basename:
|
| 168 |
+
document_type = cfg["document_type"]
|
| 169 |
+
template = load_template(cfg["template_file"])
|
| 170 |
+
return document_type, template
|
| 171 |
+
|
| 172 |
+
# fallback: raise to force user to add mapping or rename file
|
| 173 |
+
raise ValueError(
|
| 174 |
+
f"Could not infer document type from filename '{basename}'. "
|
| 175 |
+
f"Known keywords: {list(TEMPLATE_REGISTRY.keys())}"
|
| 176 |
+
)
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def pdf_bytes_to_base64_images(pdf_bytes: bytes, max_pages: int = 10) -> List[str]:
    """
    Render each page of the PDF bytes to a JPEG and return base64-encoded
    strings (no data-URL prefix). At most `max_pages` pages are rendered;
    a non-positive or None max_pages renders every page.
    """
    document = pdfium.PdfDocument(pdf_bytes)
    encoded_pages: List[str] = []

    try:
        total_pages = len(document)
        if max_pages is not None and max_pages > 0:
            page_count = min(total_pages, max_pages)
        else:
            page_count = total_pages

        # Trade resolution for payload size as the page count grows.
        if page_count <= 2:
            render_scale, jpeg_quality = 4.17, 80  # ~300 DPI
        elif page_count <= 10:
            render_scale, jpeg_quality = 2.0, 60   # ~145 DPI
        else:
            render_scale, jpeg_quality = 1.5, 60   # ~110 DPI

        for index in range(page_count):
            rendered = document[index].render(scale=render_scale).to_pil()
            with io.BytesIO() as buffer:
                rendered.save(buffer, format="JPEG", quality=jpeg_quality)
                encoded_pages.append(
                    base64.b64encode(buffer.getvalue()).decode("utf-8")
                )
            rendered.close()
    finally:
        # Release the native PDF handle even if rendering fails part-way.
        document.close()

    return encoded_pages
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def build_extraction_prompt(document_type: str, template: Dict[str, Any]) -> str:
    """
    Compose the instruction prompt that asks the model to fill the given
    template for this document type. The template is embedded verbatim as
    pretty-printed JSON so the model mirrors its exact structure.
    """
    template_json = json.dumps(template, indent=2)
    return f"""
You are a document data extraction system.

Document Type: {document_type}

Extract all information from the provided document image(s) and return it in the following exact JSON structure:

{template_json}

Instructions:
- Output only valid JSON matching exactly the structure above
- Do NOT add explanations
- Do NOT wrap the JSON in markdown, backticks, or code fences
- If a field is missing, set it to ""
- Use the exact field names; do not modify the structure
- Extract information from ALL pages
"""
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
def _get_openai_client() -> OpenAI:
    """Return the module-cached OpenAI client, creating it on first use.

    Raises RuntimeError when the API-key environment variable is unset.
    """
    global _openai_client
    if _openai_client is not None:
        return _openai_client

    api_key = os.getenv(OPENAI_API_KEY_ENV)
    if not api_key:
        raise RuntimeError(
            f"{OPENAI_API_KEY_ENV} is not set. "
            "Set it in your environment or CI secrets."
        )
    _openai_client = OpenAI(api_key=api_key)
    return _openai_client
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def _extract_text_from_response(response: Any) -> str:
|
| 260 |
+
output_text = getattr(response, "output_text", None)
|
| 261 |
+
if isinstance(output_text, str) and output_text.strip():
|
| 262 |
+
return output_text.strip()
|
| 263 |
+
|
| 264 |
+
output = getattr(response, "output", None)
|
| 265 |
+
if isinstance(output, list):
|
| 266 |
+
parts: List[str] = []
|
| 267 |
+
for item in output:
|
| 268 |
+
content = getattr(item, "content", None)
|
| 269 |
+
if content is None and isinstance(item, dict):
|
| 270 |
+
content = item.get("content")
|
| 271 |
+
if isinstance(content, list):
|
| 272 |
+
for block in content:
|
| 273 |
+
if isinstance(block, dict):
|
| 274 |
+
block_type = block.get("type")
|
| 275 |
+
if block_type in ("output_text", "text"):
|
| 276 |
+
parts.append(block.get("text", ""))
|
| 277 |
+
else:
|
| 278 |
+
block_type = getattr(block, "type", None)
|
| 279 |
+
if block_type in ("output_text", "text"):
|
| 280 |
+
parts.append(getattr(block, "text", ""))
|
| 281 |
+
return "".join(parts).strip()
|
| 282 |
+
|
| 283 |
+
return ""
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
def _invoke_openai(prompt: str, images: List[str], model: str) -> Any:
    """
    Send the extraction prompt plus the rendered page images to the OpenAI
    Responses API and return the raw response object.
    """
    client = _get_openai_client()

    # One text block followed by one input_image block per page, as data URLs.
    image_blocks = [
        {
            "type": "input_image",
            "image_url": f"data:image/jpeg;base64,{img_b64}",
        }
        for img_b64 in images
    ]
    user_content: List[Dict[str, Any]] = [
        {"type": "input_text", "text": prompt},
        *image_blocks,
    ]

    system_message = {
        "role": "system",
        "content": [
            {
                "type": "input_text",
                "text": "You are a precise document extraction engine.",
            }
        ],
    }
    user_message = {"role": "user", "content": user_content}

    return client.responses.create(
        model=model,
        temperature=0,
        input=[system_message, user_message],
    )
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
def call_openai_extract(
    document_type: str,
    template: Dict[str, Any],
    images: List[str],
    model: str = DEFAULT_MODEL,
) -> Dict[str, Any]:
    """
    Run the extraction prompt through OpenAI and parse the reply as JSON.

    The alias "default" resolves to DEFAULT_MODEL; any other alias must be
    listed in ALLOWED_MODELS. Raises ValueError for unsupported aliases,
    empty replies, or replies that are not valid JSON.
    """
    resolved_model = DEFAULT_MODEL if model == "default" else model
    if resolved_model not in ALLOWED_MODELS:
        raise ValueError(
            f"Unsupported model alias '{model}'. "
            f"Supported values: {ALLOWED_MODELS}. "
            "This extractor uses OpenAI ChatGPT models."
        )

    prompt = build_extraction_prompt(document_type, template)
    response = _invoke_openai(prompt, images, resolved_model)
    json_str = _extract_text_from_response(response).strip()

    # Some models wrap output in ```json fences despite instructions; unwrap.
    if json_str.startswith("```"):
        fenced = json_str.splitlines()
        if fenced and fenced[0].lstrip().startswith("```"):
            fenced.pop(0)
        if fenced and fenced[-1].strip().startswith("```"):
            fenced.pop()
        json_str = "\n".join(fenced).strip()

    if not json_str:
        raise ValueError(
            "Model response did not contain any text content to parse as JSON."
        )

    try:
        return json.loads(json_str)
    except json.JSONDecodeError as exc:
        # Surface a short prefix of the bad output to make debugging feasible.
        snippet = json_str[:500]
        raise ValueError(
            f"Model output was not valid JSON: {exc}. "
            f"First 500 characters of response: {snippet!r}"
        ) from exc
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
def extract_using_openai_from_pdf_bytes(
    pdf_bytes: bytes,
    filename: str,
    max_pages: int = 10,
    model: str = DEFAULT_MODEL,
) -> Dict[str, Any]:
    """
    Backwards-compatible entrypoint used by the Vision Lambda.

    Despite the legacy name, this now uses OpenAI ChatGPT to perform the
    extraction while preserving the JSON contract. The filename drives
    template selection; the PDF pages are rendered to images and sent to
    the model. Raises RuntimeError when no page images could be produced.
    """
    document_type, template = infer_template_from_filename(filename)
    page_images = pdf_bytes_to_base64_images(pdf_bytes, max_pages=max_pages)
    if not page_images:
        raise RuntimeError("No images were extracted from PDF")
    return call_openai_extract(document_type, template, page_images, model=model)
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
def _prompt_for_pdf_path() -> str:
    """
    Ask for a PDF path on stdin; CLI convenience for local runs. Web UI
    integrations should call extract_using_openai_from_pdf_bytes directly.
    Exits the process when the user supplies nothing.
    """
    pdf_path = input("Enter path to PDF: ").strip()
    if pdf_path:
        return pdf_path
    raise SystemExit("No PDF path provided.")
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
if __name__ == "__main__":
    # CLI entrypoint: prompt for a PDF path, run the full extraction pipeline,
    # and print the resulting JSON (unescaped non-ASCII) to stdout.
    pdf_path = _prompt_for_pdf_path()
    with open(pdf_path, "rb") as fh:
        pdf_data = fh.read()
    result = extract_using_openai_from_pdf_bytes(pdf_data, pdf_path)
    print(json.dumps(result, ensure_ascii=False))
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
altair
|
| 2 |
+
huggingface_hub
|
| 3 |
+
openai
|
| 4 |
+
pandas
|
| 5 |
+
pillow
|
| 6 |
+
pypdfium2
|
| 7 |
+
streamlit==1.29.0
|
src/streamlit_app.py
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import hashlib
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
|
| 6 |
+
import streamlit as st
|
| 7 |
+
import pypdfium2 as pdfium
|
| 8 |
+
from huggingface_hub import HfApi, hf_hub_download
|
| 9 |
+
|
| 10 |
+
# Make the repository root importable so `extractor` resolves when the app is
# launched from src/ (Streamlit runs this file directly).
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if ROOT_DIR not in sys.path:
    sys.path.insert(0, ROOT_DIR)

from extractor import extract_using_openai_from_pdf_bytes, TEMPLATE_REGISTRY

# Hugging Face dataset repo that hosts the sample PDFs offered in the UI.
SAMPLE_DATASET_REPO = os.getenv(
    "SAMPLE_DATASET_REPO",
    "pradyten/pdf-extractor-samples",
)


st.set_page_config(page_title="PDF Extractor", layout="wide")
|
| 23 |
+
|
| 24 |
+
# Inject the app's custom theme (fonts, warm light palette, rounded panels)
# as raw CSS. unsafe_allow_html is required for <style> blocks in Streamlit.
st.markdown(
    """
    <style>
    @import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@500;700&family=Plus+Jakarta+Sans:wght@400;500;600&display=swap');
    :root {
        --bg-0: #f3ede4;
        --bg-1: #fbf5ea;
        --panel: #ffffff;
        --border: rgba(16, 24, 40, 0.12);
        --text: #121212;
        --muted: #5b616b;
        --accent: #d4552d;
        --accent-dark: #b44725;
        --shadow: 0 18px 50px rgba(20, 20, 20, 0.12);
    }
    html, body, [data-testid="stAppViewContainer"] {
        background: radial-gradient(1200px 600px at 10% -10%, var(--bg-0) 0%, #f7f2e9 45%, var(--bg-1) 100%);
        color: var(--text);
        font-family: "Plus Jakarta Sans", system-ui, -apple-system, "Segoe UI", sans-serif;
    }
    h1, h2, h3, h4, h5 {
        font-family: "Space Grotesk", system-ui, -apple-system, "Segoe UI", sans-serif;
        letter-spacing: -0.02em;
    }
    .main .block-container {
        max-width: 1200px;
        padding-top: 2.5rem;
        padding-bottom: 3rem;
    }
    div[data-testid="column"] > div {
        background: var(--panel);
        border: 1px solid var(--border);
        border-radius: 18px;
        padding: 1.25rem 1.5rem 1.5rem 1.5rem;
        box-shadow: var(--shadow);
    }
    .stButton > button {
        background: var(--accent);
        color: #ffffff;
        border: none;
        border-radius: 999px;
        padding: 0.65rem 1.4rem;
        font-weight: 600;
    }
    .stButton > button:hover {
        background: var(--accent-dark);
        color: #ffffff;
    }
    div[data-testid="stFileUploader"] {
        border: 1px dashed rgba(16, 24, 40, 0.18);
        border-radius: 14px;
        padding: 0.6rem;
        background: rgba(248, 244, 236, 0.6);
    }
    .stAlert {
        border-radius: 12px;
    }
    pre, code, .stCodeBlock {
        border-radius: 12px !important;
    }
    #MainMenu, footer {
        visibility: hidden;
    }
    </style>
    """,
    unsafe_allow_html=True,
)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def _render_pdf_preview(pdf_bytes: bytes) -> None:
    """Render page 1 of the PDF in the UI; degrade to a warning on failure."""
    document = None
    try:
        document = pdfium.PdfDocument(pdf_bytes)
        if len(document) < 1:
            st.info("No pages found in this PDF.")
            return
        first_page_image = document[0].render(scale=2.0).to_pil()
        st.image(first_page_image, caption="Preview (page 1)", use_column_width=True)
    except Exception as exc:  # pragma: no cover - UI preview path
        st.warning(f"Preview unavailable: {exc}")
    finally:
        # Close the native handle regardless of how rendering went.
        if document is not None:
            document.close()
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def _load_pdf_state(uploaded_file) -> tuple[bytes, str, str]:
|
| 111 |
+
pdf_bytes = uploaded_file.getvalue()
|
| 112 |
+
digest = hashlib.sha256(pdf_bytes).hexdigest()
|
| 113 |
+
return pdf_bytes, uploaded_file.name, digest
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
@st.cache_data(show_spinner=False)
def _list_sample_pdfs(repo_id: str) -> list[str]:
    """List .pdf files in the sample dataset repo; empty list on any Hub error."""
    try:
        repo_files = HfApi().list_repo_files(repo_id=repo_id, repo_type="dataset")
    except Exception:
        # Network/auth failures are non-fatal: the UI simply shows no samples.
        return []
    return sorted(f for f in repo_files if f.lower().endswith(".pdf"))
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
@st.cache_data(show_spinner=False)
def _load_sample_state(repo_id: str, filename: str) -> tuple[bytes, str, str]:
    """Download a sample PDF from the HF dataset; return (bytes, name, digest)."""
    local_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset")
    with open(local_path, "rb") as handle:
        data = handle.read()
    return data, filename, hashlib.sha256(data).hexdigest()
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def _build_download_name(filename: str) -> str:
|
| 136 |
+
base = os.path.splitext(filename)[0] if filename else "extraction"
|
| 137 |
+
safe = "".join(ch if ch.isalnum() or ch in ("-", "_") else "_" for ch in base)
|
| 138 |
+
if not safe:
|
| 139 |
+
safe = "extraction"
|
| 140 |
+
return f"{safe}_extracted.json"
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def _reset_pdf_state() -> None:
    """Clear all PDF- and extraction-related session-state keys back to None."""
    for key in (
        "pdf_bytes",
        "pdf_filename",
        "pdf_digest",
        "extract_result",
        "extract_error",
        "extract_digest",
        "extract_filename",
    ):
        st.session_state[key] = None
+
|
| 152 |
+
|
| 153 |
+
def _supported_doc_types() -> list[str]:
    """Unique document types from TEMPLATE_REGISTRY, in registry order."""
    doc_types = (cfg.get("document_type") for cfg in TEMPLATE_REGISTRY.values())
    # dict.fromkeys dedupes while preserving first-seen order; falsy entries dropped.
    return list(dict.fromkeys(dt for dt in doc_types if dt))
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# Seed every session-state key the app reads so first-run access never fails.
for _state_key in (
    "extract_result",
    "extract_error",
    "extract_digest",
    "extract_filename",
    "pdf_bytes",
    "pdf_filename",
    "pdf_digest",
    "input_mode_prev",
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = None
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
# Page header and two-column layout: input/preview left, extraction output right.
st.markdown("## PDF Extractor")
st.markdown(
    "Choose a sample or upload your own PDF, preview it, then click Extract "
    "to generate structured JSON on the right."
)

left, right = st.columns([1, 1], gap="large")
|
| 187 |
+
|
| 188 |
+
with left:
    # Left column: choose the input source, load bytes into session state,
    # and show a first-page preview.
    st.markdown("### Upload + Preview")
    input_mode = st.radio(
        "Input source",
        ["Upload PDF", "Use sample"],
        horizontal=True,
        label_visibility="collapsed",
        key="input_mode",
    )
    # Switching between upload/sample invalidates any previously loaded PDF.
    if st.session_state.input_mode_prev != input_mode:
        _reset_pdf_state()
        st.session_state.input_mode_prev = input_mode

    selected_sample = None
    uploaded_file = None

    if input_mode == "Use sample":
        sample_files = _list_sample_pdfs(SAMPLE_DATASET_REPO)
        if not sample_files:
            st.info("No sample PDFs found in the sample dataset yet.")
            _reset_pdf_state()
        # Placeholder first entry keeps "nothing chosen" distinguishable.
        sample_options = ["Choose a sample..."] + sample_files
        sample_choice = st.selectbox(
            "Choose a sample",
            sample_options,
            label_visibility="collapsed",
            key="sample_choice",
        )
        selected_sample = sample_choice if sample_choice in sample_files else None
        if selected_sample is None:
            _reset_pdf_state()
    else:
        uploaded_file = st.file_uploader(
            "Upload a PDF",
            type=["pdf"],
            accept_multiple_files=False,
            label_visibility="collapsed",
            key="pdf_uploader",
            help="File name should include a known keyword (for example: resume, passport, i129).",
        )

    if input_mode == "Use sample" and selected_sample:
        try:
            pdf_bytes, filename, digest = _load_sample_state(
                SAMPLE_DATASET_REPO,
                selected_sample,
            )
        except Exception as exc:  # pragma: no cover - sample load path
            st.error(f"Sample load failed: {exc}")
        else:
            # Only reset cached extraction output when the PDF actually changed
            # (digest comparison survives Streamlit reruns).
            if st.session_state.pdf_digest != digest:
                st.session_state.pdf_bytes = pdf_bytes
                st.session_state.pdf_filename = filename
                st.session_state.pdf_digest = digest
                st.session_state.extract_result = None
                st.session_state.extract_error = None
                st.session_state.extract_digest = digest
                st.session_state.extract_filename = filename

            st.markdown(f"**Sample:** `{st.session_state.pdf_filename}`")
            _render_pdf_preview(st.session_state.pdf_bytes)
    elif input_mode == "Upload PDF" and uploaded_file is not None:
        pdf_bytes, filename, digest = _load_pdf_state(uploaded_file)
        # Same digest-guarded state refresh as the sample branch.
        if st.session_state.pdf_digest != digest:
            st.session_state.pdf_bytes = pdf_bytes
            st.session_state.pdf_filename = filename
            st.session_state.pdf_digest = digest
            st.session_state.extract_result = None
            st.session_state.extract_error = None
            st.session_state.extract_digest = digest
            st.session_state.extract_filename = filename

        st.markdown(f"**File:** `{st.session_state.pdf_filename}`")
        _render_pdf_preview(st.session_state.pdf_bytes)
    else:
        st.info("Upload a PDF or choose a sample to preview it here.")

    st.markdown("#### Notes")
    st.caption(
        "Template selection is inferred from the filename. If extraction fails, "
        "rename the file to include a supported keyword (for example: "
        "`resume.pdf`, `passport_jane.pdf`, `i129_petition.pdf`)."
    )
    st.caption(f"Sample dataset: `{SAMPLE_DATASET_REPO}`")
    st.markdown("#### Supported documents")
    st.markdown("\n".join(f"- {doc}" for doc in _supported_doc_types()))
|
| 274 |
+
|
| 275 |
+
with right:
    # Right column: model selection, the Extract action, and the JSON result.
    st.markdown("### Extract")
    model_choice = st.selectbox(
        "Model",
        ["default", "gpt-4.1-mini", "gpt-4.1", "gpt-4o-mini", "gpt-4o"],
        index=1,
        help="Choose a model or use default (EXTRACTOR_MODEL_ALIAS).",
    )

    # Extraction needs an OpenAI key; disable the button when it is missing.
    has_api_key = bool(os.getenv("OPENAI_API_KEY"))
    if not has_api_key:
        st.warning("OPENAI_API_KEY is not set. Add it to your environment or Space secrets.")

    extract_clicked = st.button(
        "Extract",
        use_container_width=False,
        disabled=st.session_state.pdf_bytes is None or not has_api_key,
    )

    if extract_clicked:
        with st.spinner("Extracting structured JSON..."):
            try:
                result = extract_using_openai_from_pdf_bytes(
                    st.session_state.pdf_bytes,
                    st.session_state.pdf_filename,
                    model=model_choice,
                )
                st.session_state.extract_result = result
                st.session_state.extract_error = None
            except Exception as exc:  # pragma: no cover - runtime error path
                message = str(exc)
                # Rewrite opaque 403 failures into an actionable hint.
                if "403" in message or "PermissionDenied" in message:
                    message = (
                        "OpenAI request was rejected (403). "
                        "Check OPENAI_API_KEY, model access, and billing."
                    )
                st.session_state.extract_error = message
                st.session_state.extract_result = None

    # Result/error live in session state so they survive Streamlit reruns.
    if st.session_state.extract_error:
        st.error(st.session_state.extract_error)

    if st.session_state.extract_result is None:
        st.info("Extraction output will appear here.")
    else:
        st.markdown("#### JSON Output")
        json_text = json.dumps(
            st.session_state.extract_result,
            indent=2,
            ensure_ascii=False,
        )
        st.code(json_text, language="json")
        st.download_button(
            "Download JSON",
            data=json_text,
            file_name=_build_download_name(st.session_state.pdf_filename or ""),
            mime="application/json",
        )
|
templates/corporate_tax_returns.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"legal_business_name": "",
|
| 3 |
+
"trade_name_dba": "",
|
| 4 |
+
"contact_person": {
|
| 5 |
+
"first_name": "",
|
| 6 |
+
"last_name": "",
|
| 7 |
+
"title": ""
|
| 8 |
+
},
|
| 9 |
+
"business_address": {
|
| 10 |
+
"street": "",
|
| 11 |
+
"apt_ste_flr": "",
|
| 12 |
+
"city": "",
|
| 13 |
+
"state": "",
|
| 14 |
+
"zip_code": ""
|
| 15 |
+
},
|
| 16 |
+
"telephone_number": "",
|
| 17 |
+
"naics_code": "",
|
| 18 |
+
"type_of_business": "",
|
| 19 |
+
"federal_employer_identification_number": "",
|
| 20 |
+
"gross_annual_income": "",
|
| 21 |
+
"net_annual_income": "",
|
| 22 |
+
"company_signatories": [
|
| 23 |
+
{
|
| 24 |
+
"name": "",
|
| 25 |
+
"title": "",
|
| 26 |
+
"signature_date (MM/DD/YY)": ""
|
| 27 |
+
}
|
| 28 |
+
],
|
| 29 |
+
"tax_year": "",
|
| 30 |
+
"form_type": ""
|
| 31 |
+
}
|
templates/diplomas.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"diploma": {
|
| 3 |
+
"institution_information": {
|
| 4 |
+
"institution_name": "",
|
| 5 |
+
"school_or_college_name": "",
|
| 6 |
+
"campus": "",
|
| 7 |
+
"address": {
|
| 8 |
+
"street": "",
|
| 9 |
+
"apt_ste_flr": "",
|
| 10 |
+
"city": "",
|
| 11 |
+
"state": "",
|
| 12 |
+
"zip_code": "",
|
| 13 |
+
"country": ""
|
| 14 |
+
},
|
| 15 |
+
"telephone_number": "",
|
| 16 |
+
"email": "",
|
| 17 |
+
"website": ""
|
| 18 |
+
},
|
| 19 |
+
"student_information": {
|
| 20 |
+
"full_name": "",
|
| 21 |
+
"first_name": "",
|
| 22 |
+
"last_name": "",
|
| 23 |
+
"date_of_birth (MM/DD/YY)": "",
|
| 24 |
+
"student_id": ""
|
| 25 |
+
},
|
| 26 |
+
"diploma_details": {
|
| 27 |
+
"has_signature": "",
|
| 28 |
+
"has_official_seal": "",
|
| 29 |
+
"issue_date (MM/DD/YY)": "",
|
| 30 |
+
"degree_type": "",
|
| 31 |
+
"major": "",
|
| 32 |
+
"minor": "",
|
| 33 |
+
"concentration": "",
|
| 34 |
+
"honors": "",
|
| 35 |
+
"program_length": ""
|
| 36 |
+
},
|
| 37 |
+
"signatories": [
|
| 38 |
+
{
|
| 39 |
+
"name": "",
|
| 40 |
+
"title": "",
|
| 41 |
+
"signature_date (MM/DD/YY)": ""
|
| 42 |
+
}
|
| 43 |
+
],
|
| 44 |
+
"document_metadata": {
|
| 45 |
+
"diploma_number": "",
|
| 46 |
+
"serial_number": "",
|
| 47 |
+
"language": "",
|
| 48 |
+
"format": ""
|
| 49 |
+
}
|
| 50 |
+
}
|
| 51 |
+
}
|
templates/employment_letter.json
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"employment_letter": {
|
| 3 |
+
"letter_type": "",
|
| 4 |
+
"letter_date (MM/DD/YY)": "",
|
| 5 |
+
"employee_information": {
|
| 6 |
+
"full_name": "",
|
| 7 |
+
"first_name": "",
|
| 8 |
+
"last_name": "",
|
| 9 |
+
"date_of_birth (MM/DD/YY)": "",
|
| 10 |
+
"employee_id": "",
|
| 11 |
+
"job_title": "",
|
| 12 |
+
"department": ""
|
| 13 |
+
},
|
| 14 |
+
"employment_details": {
|
| 15 |
+
"employment_status": "",
|
| 16 |
+
"employment_start_date (MM/DD/YY)": "",
|
| 17 |
+
"employment_end_date (MM/DD/YY)": "",
|
| 18 |
+
"is_currently_employed": "",
|
| 19 |
+
"work_schedule": "",
|
| 20 |
+
"hours_per_week": "",
|
| 21 |
+
"full_time (yes/no)": "",
|
| 22 |
+
"salary": "",
|
| 23 |
+
"salary_frequency (year/month/etc.)": "",
|
| 24 |
+
"bonus_or_variable_pay": "",
|
| 25 |
+
"job_duties_summary": ""
|
| 26 |
+
},
|
| 27 |
+
"employer_information": {
|
| 28 |
+
"legal_business_name": "",
|
| 29 |
+
"trade_name_dba": "",
|
| 30 |
+
"business_address": {
|
| 31 |
+
"street": "",
|
| 32 |
+
"apt_ste_flr": "",
|
| 33 |
+
"city": "",
|
| 34 |
+
"state": "",
|
| 35 |
+
"zip_code": "",
|
| 36 |
+
"country": ""
|
| 37 |
+
},
|
| 38 |
+
"telephone_number": "",
|
| 39 |
+
"email": "",
|
| 40 |
+
"website": ""
|
| 41 |
+
},
|
| 42 |
+
"supervisor_or_hr_contact": {
|
| 43 |
+
"name": "",
|
| 44 |
+
"title": "",
|
| 45 |
+
"phone_number": "",
|
| 46 |
+
"email": ""
|
| 47 |
+
},
|
| 48 |
+
"work_location": [
|
| 49 |
+
{
|
| 50 |
+
"street": "",
|
| 51 |
+
"apt_ste_flr": "",
|
| 52 |
+
"city": "",
|
| 53 |
+
"state": "",
|
| 54 |
+
"zip_code": "",
|
| 55 |
+
"is_third_party_location": "",
|
| 56 |
+
"third_party_name": ""
|
| 57 |
+
}
|
| 58 |
+
],
|
| 59 |
+
"has_signature": "",
|
| 60 |
+
"signatory": {
|
| 61 |
+
"name": "",
|
| 62 |
+
"title": "",
|
| 63 |
+
"signature_date (MM/DD/YY)": ""
|
| 64 |
+
}
|
| 65 |
+
}
|
| 66 |
+
}
|
templates/i129_h1b_petition.json
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"i129_h1b_petition": {
|
| 3 |
+
"has_signature": "",
|
| 4 |
+
"form_edition": "",
|
| 5 |
+
"petition_type": "H-1B",
|
| 6 |
+
|
| 7 |
+
"petitioner_information": {
|
| 8 |
+
"petitioner_is_individual": "",
|
| 9 |
+
"individual_petitioner": {
|
| 10 |
+
"family_name": "",
|
| 11 |
+
"given_name": "",
|
| 12 |
+
"middle_name": ""
|
| 13 |
+
},
|
| 14 |
+
"company_information": {
|
| 15 |
+
"company_name": "",
|
| 16 |
+
"fein": "",
|
| 17 |
+
"is_nonprofit_or_government_research_org": "",
|
| 18 |
+
"number_of_employees_in_us": "",
|
| 19 |
+
"year_established": "",
|
| 20 |
+
"type_of_business": "",
|
| 21 |
+
"gross_annual_income": "",
|
| 22 |
+
"net_annual_income": ""
|
| 23 |
+
},
|
| 24 |
+
"mailing_address": {
|
| 25 |
+
"in_care_of": "",
|
| 26 |
+
"street": "",
|
| 27 |
+
"apt_ste_flr": "",
|
| 28 |
+
"city": "",
|
| 29 |
+
"state": "",
|
| 30 |
+
"zip_code": "",
|
| 31 |
+
"province": "",
|
| 32 |
+
"postal_code": "",
|
| 33 |
+
"country": ""
|
| 34 |
+
},
|
| 35 |
+
"contact_information": {
|
| 36 |
+
"daytime_phone": "",
|
| 37 |
+
"mobile_phone": "",
|
| 38 |
+
"email": ""
|
| 39 |
+
}
|
| 40 |
+
},
|
| 41 |
+
|
| 42 |
+
"petition_information": {
|
| 43 |
+
"requested_classification_symbol": "",
|
| 44 |
+
"basis_for_classification": "",
|
| 45 |
+
"most_recent_receipt_number": "",
|
| 46 |
+
"requested_action": "",
|
| 47 |
+
"total_workers_in_petition": ""
|
| 48 |
+
},
|
| 49 |
+
|
| 50 |
+
"beneficiary_information": {
|
| 51 |
+
"type_of_beneficiary": "",
|
| 52 |
+
"group_name_if_entertainment": "",
|
| 53 |
+
"full_name": {
|
| 54 |
+
"family_name": "",
|
| 55 |
+
"given_name": "",
|
| 56 |
+
"middle_name": ""
|
| 57 |
+
},
|
| 58 |
+
"other_names_used": "",
|
| 59 |
+
"date_of_birth": "",
|
| 60 |
+
"sex": "",
|
| 61 |
+
"country_of_birth": "",
|
| 62 |
+
"province_of_birth": "",
|
| 63 |
+
"country_of_citizenship": "",
|
| 64 |
+
"alien_number": "",
|
| 65 |
+
"ssn": "",
|
| 66 |
+
"itin": "",
|
| 67 |
+
"passport": {
|
| 68 |
+
"number": "",
|
| 69 |
+
"country_of_issuance": "",
|
| 70 |
+
"date_issued": "",
|
| 71 |
+
"date_expires": ""
|
| 72 |
+
},
|
| 73 |
+
"i94_number": "",
|
| 74 |
+
"date_of_last_arrival": "",
|
| 75 |
+
"current_us_address": {
|
| 76 |
+
"street": "",
|
| 77 |
+
"apt_ste_flr": "",
|
| 78 |
+
"city": "",
|
| 79 |
+
"state": "",
|
| 80 |
+
"zip_code": ""
|
| 81 |
+
},
|
| 82 |
+
"current_nonimmigrant_status": "",
|
| 83 |
+
"status_expiration_date": "",
|
| 84 |
+
"sevis_number": "",
|
| 85 |
+
"ead_number": "",
|
| 86 |
+
"foreign_address": {
|
| 87 |
+
"street": "",
|
| 88 |
+
"apt_ste_flr": "",
|
| 89 |
+
"city": "",
|
| 90 |
+
"state_province": "",
|
| 91 |
+
"postal_code": "",
|
| 92 |
+
"country": ""
|
| 93 |
+
}
|
| 94 |
+
},
|
| 95 |
+
|
| 96 |
+
"processing_information": {
|
| 97 |
+
"consulate_or_poe_to_notify": {
|
| 98 |
+
"type": "",
|
| 99 |
+
"city": "",
|
| 100 |
+
"state_or_country": ""
|
| 101 |
+
},
|
| 102 |
+
"beneficiary_has_valid_passport": "",
|
| 103 |
+
"applications_for_dependents_filed": "",
|
| 104 |
+
"other_petitions_filed": "",
|
| 105 |
+
"applications_for_replacement_i94": "",
|
| 106 |
+
"beneficiary_in_removal_proceedings": "",
|
| 107 |
+
"previous_nonimmigrant_petitions_filed_for_beneficiary": "",
|
| 108 |
+
"previous_immigrant_petitions_filed": "",
|
| 109 |
+
"is_new_petition": "",
|
| 110 |
+
"beneficiary_previously_held_same_classification_last_7_years": "",
|
| 111 |
+
"beneficiary_denied_same_classification_last_7_years": "",
|
| 112 |
+
"beneficiary_j1_or_j2_history": "",
|
| 113 |
+
"j1_j2_status_dates": ""
|
| 114 |
+
},
|
| 115 |
+
|
| 116 |
+
"employment_information": {
|
| 117 |
+
"job_title": "",
|
| 118 |
+
"lca_or_ETA_case_number": "",
|
| 119 |
+
"worksite_addresses": [
|
| 120 |
+
{
|
| 121 |
+
"street": "",
|
| 122 |
+
"apt_ste_flr": "",
|
| 123 |
+
"city": "",
|
| 124 |
+
"state": "",
|
| 125 |
+
"zip_code": "",
|
| 126 |
+
"is_third_party_location": "",
|
| 127 |
+
"third_party_name": ""
|
| 128 |
+
}
|
| 129 |
+
],
|
| 130 |
+
"wages": {
|
| 131 |
+
"amount": "",
|
| 132 |
+
"frequency": ""
|
| 133 |
+
},
|
| 134 |
+
"other_compensation": "",
|
| 135 |
+
"employment_dates": {
|
| 136 |
+
"from": "",
|
| 137 |
+
"to": ""
|
| 138 |
+
},
|
| 139 |
+
"full_time_position": "",
|
| 140 |
+
"hours_per_week": "",
|
| 141 |
+
"will_work_in_cnmi": "",
|
| 142 |
+
"itinerary_included": ""
|
| 143 |
+
},
|
| 144 |
+
|
| 145 |
+
"export_control_certification": {
|
| 146 |
+
"license_not_required": "",
|
| 147 |
+
"license_required": ""
|
| 148 |
+
},
|
| 149 |
+
|
| 150 |
+
"h1b_classification_details": {
|
| 151 |
+
"prior_H_or_L_periods_of_stay": [],
|
| 152 |
+
"confirmation_number_h1b_registration": "",
|
| 153 |
+
"passport_used_for_registration": {
|
| 154 |
+
"number": "",
|
| 155 |
+
"country_of_issuance": "",
|
| 156 |
+
"expiration_date": ""
|
| 157 |
+
},
|
| 158 |
+
"proposed_duties_description": "",
|
| 159 |
+
"beneficiary_present_occupation_and_experience": "",
|
| 160 |
+
"beneficiary_controlling_interest_in_petitioner": "",
|
| 161 |
+
"controlling_interest_explanation": "",
|
| 162 |
+
"requesting_change_of_employer_and_was_previous_CNMI_exempt": "",
|
| 163 |
+
"subject_to_CNMI_cap_exemption": ""
|
| 164 |
+
},
|
| 165 |
+
|
| 166 |
+
"h1b_fee_and_dependency_information": {
|
| 167 |
+
"petitioner_is_h1b_dependent": "",
|
| 168 |
+
"petitioner_is_willful_violator": "",
|
| 169 |
+
"beneficiary_is_exempt_from_attestation": "",
|
| 170 |
+
"exemption_reason_salary_over_60000": "",
|
| 171 |
+
"exemption_reason_master_or_higher_degree": "",
|
| 172 |
+
"petitioner_has_50_or_more_employees": "",
|
| 173 |
+
"more_than_50_percent_in_H1B_L1_status": "",
|
| 174 |
+
"beneficiary_highest_education_level": "",
|
| 175 |
+
"field_of_study": "",
|
| 176 |
+
"rate_of_pay_per_year": "",
|
| 177 |
+
"dot_code": "",
|
| 178 |
+
"naics_code": "",
|
| 179 |
+
"acwia_fee_applicable": "",
|
| 180 |
+
"fraud_fee_applicable": "",
|
| 181 |
+
"public_law_114_113_fee_applicable": ""
|
| 182 |
+
},
|
| 183 |
+
|
| 184 |
+
"numerical_cap_information": {
|
| 185 |
+
"cap_type": "",
|
| 186 |
+
"masters_cap_degree_details": {
|
| 187 |
+
"us_institution_name": "",
|
| 188 |
+
"degree_type": "",
|
| 189 |
+
"date_awarded": "",
|
| 190 |
+
"institution_address": ""
|
| 191 |
+
},
|
| 192 |
+
"reason_for_cap_exemption": ""
|
| 193 |
+
},
|
| 194 |
+
|
| 195 |
+
"offsite_assignment_information": {
|
| 196 |
+
"assigned_offsite": "",
|
| 197 |
+
"complies_with_offsite_h1b_requirements": "",
|
| 198 |
+
"paid_actual_or_prevailing_wage_at_offsite": ""
|
| 199 |
+
},
|
| 200 |
+
|
| 201 |
+
"signatory_information": {
|
| 202 |
+
"authorized_signatory_name": "",
|
| 203 |
+
"title": "",
|
| 204 |
+
"signature_date": "",
|
| 205 |
+
"daytime_phone": "",
|
| 206 |
+
"email": ""
|
| 207 |
+
},
|
| 208 |
+
|
| 209 |
+
"preparer_information": {
|
| 210 |
+
"prepared_by_someone_else": "",
|
| 211 |
+
"preparer_name": "",
|
| 212 |
+
"preparer_business_name": "",
|
| 213 |
+
"preparer_address": "",
|
| 214 |
+
"preparer_contact": {
|
| 215 |
+
"daytime_phone": "",
|
| 216 |
+
"fax": "",
|
| 217 |
+
"email": ""
|
| 218 |
+
},
|
| 219 |
+
"preparer_signature_date": ""
|
| 220 |
+
},
|
| 221 |
+
|
| 222 |
+
"additional_information": ""
|
| 223 |
+
}
|
| 224 |
+
}
|
templates/i_94.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"I-94": {
|
| 3 |
+
"Record_number": "",
|
| 4 |
+
"Recent_date_of_entry (MM/DD/YY)": "",
|
| 5 |
+
"class_of_admission": "",
|
| 6 |
+
"admit_until_date (MM/DD/YY)": "",
|
| 7 |
+
"last_name": "",
|
| 8 |
+
"first_name": "",
|
| 9 |
+
"date_of_birth (MM/DD/YY)": "",
|
| 10 |
+
"passport_number": "",
|
| 11 |
+
"country_of_issue": "",
|
| 12 |
+
"gender (Fullform)": ""
|
| 13 |
+
}
|
| 14 |
+
}
|
templates/marriage_certificate.json
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"marriage_certificate":
|
| 3 |
+
{
|
| 4 |
+
"certificate_number": "",
|
| 5 |
+
"registration_number": "",
|
| 6 |
+
"issue_date (MM/DD/YY)": "",
|
| 7 |
+
"issuing_authority": "",
|
| 8 |
+
"issuing_jurisdiction": {
|
| 9 |
+
"city": "",
|
| 10 |
+
"county": "",
|
| 11 |
+
"state": "",
|
| 12 |
+
"country": ""
|
| 13 |
+
},
|
| 14 |
+
"marriage_date (MM/DD/YY)": "",
|
| 15 |
+
"marriage_location": {
|
| 16 |
+
"venue_name": "",
|
| 17 |
+
"city": "",
|
| 18 |
+
"county": "",
|
| 19 |
+
"state": "",
|
| 20 |
+
"country": ""
|
| 21 |
+
},
|
| 22 |
+
"Bride": {
|
| 23 |
+
"first_name": "",
|
| 24 |
+
"last_name": "",
|
| 25 |
+
"date_of_birth (MM/DD/YY)": "",
|
| 26 |
+
"place_of_birth": {
|
| 27 |
+
"city": "",
|
| 28 |
+
"state": "",
|
| 29 |
+
"country": ""
|
| 30 |
+
},
|
| 31 |
+
"residence": {
|
| 32 |
+
"city": "",
|
| 33 |
+
"state": "",
|
| 34 |
+
"country": ""
|
| 35 |
+
},
|
| 36 |
+
"father_name": "",
|
| 37 |
+
"mother_name": ""
|
| 38 |
+
},
|
| 39 |
+
"spouse_2": {
|
| 40 |
+
"first_name": "",
|
| 41 |
+
"last_name": "",
|
| 42 |
+
"date_of_birth (MM/DD/YY)": "",
|
| 43 |
+
"place_of_birth": {
|
| 44 |
+
"city": "",
|
| 45 |
+
"state": "",
|
| 46 |
+
"country": ""
|
| 47 |
+
},
|
| 48 |
+
"residence": {
|
| 49 |
+
"city": "",
|
| 50 |
+
"state": "",
|
| 51 |
+
"country": ""
|
| 52 |
+
},
|
| 53 |
+
"father_name": "",
|
| 54 |
+
"mother_name": ""
|
| 55 |
+
},
|
| 56 |
+
"officiant_info": {
|
| 57 |
+
"name": "",
|
| 58 |
+
"title": "",
|
| 59 |
+
"organization": ""
|
| 60 |
+
},
|
| 61 |
+
"witness_1": {
|
| 62 |
+
"name": "",
|
| 63 |
+
"address": ""
|
| 64 |
+
},
|
| 65 |
+
"witness_2": {
|
| 66 |
+
"name": "",
|
| 67 |
+
"address": ""
|
| 68 |
+
}
|
| 69 |
+
}
|
| 70 |
+
}
|
templates/passport.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"passport": {
|
| 3 |
+
"number": "",
|
| 4 |
+
"type": "",
|
| 5 |
+
"country_code" : "",
|
| 6 |
+
"surname": "",
|
| 7 |
+
"given_name": "",
|
| 8 |
+
"nationality": "",
|
| 9 |
+
"citizenship_country_name": "",
|
| 10 |
+
"date_of_birth (MM/DD/YY)": "",
|
| 11 |
+
"citizenship_number": "",
|
| 12 |
+
"sex (Fullform)": "",
|
| 13 |
+
"province_of_birth": "",
|
| 14 |
+
"country_of_birth": "",
|
| 15 |
+
"issue_date (MM/DD/YY)": "",
|
| 16 |
+
"expiration_date (MM/DD/YY)": "",
|
| 17 |
+
"issuing_country_name": "",
|
| 18 |
+
"fathers_name": ""
|
| 19 |
+
}
|
| 20 |
+
}
|
templates/proof_of_in_country_status.json
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"I-20":
|
| 2 |
+
{
|
| 3 |
+
"sevis_id": "",
|
| 4 |
+
"student_info": {
|
| 5 |
+
"surname_primary_name": "",
|
| 6 |
+
"given_name": "",
|
| 7 |
+
"preferred_name": "",
|
| 8 |
+
"passport_name": "",
|
| 9 |
+
"country_of_birth": "",
|
| 10 |
+
"country_of_citizenship": "",
|
| 11 |
+
"city_of_birth": "",
|
| 12 |
+
"date_of_birth (MM/DD/YY)": "",
|
| 13 |
+
"class_of_admission": ""
|
| 14 |
+
},
|
| 15 |
+
"school_information": {
|
| 16 |
+
"school_name": "",
|
| 17 |
+
"school_address": "",
|
| 18 |
+
"school_official_to_contact_upon_arrival": "",
|
| 19 |
+
"school_code_and_approval_date (MM/DD/YY)": "",
|
| 20 |
+
"admission_number": ""
|
| 21 |
+
},
|
| 22 |
+
"program_of_study": {
|
| 23 |
+
"education_level": "",
|
| 24 |
+
"major_1": "",
|
| 25 |
+
"major_2": "",
|
| 26 |
+
"program_english_proficiency": "",
|
| 27 |
+
"english_proficiency_notes": "",
|
| 28 |
+
"earliest_admission_date (MM/DD/YY)": "",
|
| 29 |
+
"start_of_classes (MM/DD/YY)": "",
|
| 30 |
+
"program_start_end_date (MM/DD/YY)": "",
|
| 31 |
+
"form_issue_reason": ""
|
| 32 |
+
},
|
| 33 |
+
"financials": {
|
| 34 |
+
"estimated_average_costs_9_months": {
|
| 35 |
+
"tuition_and_fees": "",
|
| 36 |
+
"living_expenses": "",
|
| 37 |
+
"expenses_of_dependents": "",
|
| 38 |
+
"books_health_insurance": "",
|
| 39 |
+
"total": ""
|
| 40 |
+
},
|
| 41 |
+
"students_funding_9_months": {
|
| 42 |
+
"personal_funds": "",
|
| 43 |
+
"funds_from_this_school": "",
|
| 44 |
+
"abroad_family_member": "",
|
| 45 |
+
"on_campus_employment": "",
|
| 46 |
+
"total": ""
|
| 47 |
+
}
|
| 48 |
+
},
|
| 49 |
+
"school_attestation": {
|
| 50 |
+
"signature_of": "",
|
| 51 |
+
"date_issued (MM/DD/YY)": "",
|
| 52 |
+
"place_issued": ""
|
| 53 |
+
},
|
| 54 |
+
"student_attestation": {
|
| 55 |
+
"signature_of": "",
|
| 56 |
+
"date (MM/DD/YY)": ""
|
| 57 |
+
},
|
| 58 |
+
"employment_authorizations": {
|
| 59 |
+
"current_session_start_date (MM/DD/YY)": "",
|
| 60 |
+
"current_session_end_date (MM/DD/YY)": ""
|
| 61 |
+
},
|
| 62 |
+
"travel_endorsement": {
|
| 63 |
+
"designated_school_official": "",
|
| 64 |
+
"title": "",
|
| 65 |
+
"date_issued (MM/DD/YY)": "",
|
| 66 |
+
"place_issued": ""
|
| 67 |
+
}
|
| 68 |
+
}
|
| 69 |
+
}
|
templates/resume.json
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"resume": {
|
| 3 |
+
"personal_information": {
|
| 4 |
+
"full_name": "",
|
| 5 |
+
"first_name": "",
|
| 6 |
+
"last_name": "",
|
| 7 |
+
"email": "",
|
| 8 |
+
"phone_number": "",
|
| 9 |
+
"linkedin_url": "",
|
| 10 |
+
"github_url": "",
|
| 11 |
+
"website_url": "",
|
| 12 |
+
"address": {
|
| 13 |
+
"street": "",
|
| 14 |
+
"apt_ste_flr": "",
|
| 15 |
+
"city": "",
|
| 16 |
+
"state": "",
|
| 17 |
+
"zip_code": "",
|
| 18 |
+
"country": ""
|
| 19 |
+
}
|
| 20 |
+
},
|
| 21 |
+
"professional_title": "",
|
| 22 |
+
"summary": "",
|
| 23 |
+
"work_experience": [
|
| 24 |
+
{
|
| 25 |
+
"job_title": "",
|
| 26 |
+
"employer_name": "",
|
| 27 |
+
"employment_type": "",
|
| 28 |
+
"location_city": "",
|
| 29 |
+
"location_state": "",
|
| 30 |
+
"location_country": "",
|
| 31 |
+
"start_date (MM/DD/YY)": "",
|
| 32 |
+
"end_date (MM/DD/YY)": "",
|
| 33 |
+
"is_current_role": "",
|
| 34 |
+
"description": "",
|
| 35 |
+
"achievements": [
|
| 36 |
+
""
|
| 37 |
+
],
|
| 38 |
+
"technologies_used": [
|
| 39 |
+
""
|
| 40 |
+
]
|
| 41 |
+
}
|
| 42 |
+
],
|
| 43 |
+
"education": [
|
| 44 |
+
{
|
| 45 |
+
"institution_name": "",
|
| 46 |
+
"degree": "",
|
| 47 |
+
"field_of_study": "",
|
| 48 |
+
"start_date (MM/DD/YY)": "",
|
| 49 |
+
"end_date (MM/DD/YY)": "",
|
| 50 |
+
"is_current_program": "",
|
| 51 |
+
"gpa": "",
|
| 52 |
+
"location_city": "",
|
| 53 |
+
"location_state": "",
|
| 54 |
+
"location_country": ""
|
| 55 |
+
}
|
| 56 |
+
],
|
| 57 |
+
"skills": {
|
| 58 |
+
"technical_skills": [
|
| 59 |
+
""
|
| 60 |
+
],
|
| 61 |
+
"soft_skills": [
|
| 62 |
+
""
|
| 63 |
+
],
|
| 64 |
+
"tools_and_technologies": [
|
| 65 |
+
""
|
| 66 |
+
],
|
| 67 |
+
"languages": [
|
| 68 |
+
{
|
| 69 |
+
"language": "",
|
| 70 |
+
"proficiency": ""
|
| 71 |
+
}
|
| 72 |
+
]
|
| 73 |
+
},
|
| 74 |
+
"certifications": [
|
| 75 |
+
{
|
| 76 |
+
"name": "",
|
| 77 |
+
"issuer": "",
|
| 78 |
+
"issue_date (MM/DD/YY)": "",
|
| 79 |
+
"expiration_date (MM/DD/YY)": "",
|
| 80 |
+
"credential_id": "",
|
| 81 |
+
"credential_url": ""
|
| 82 |
+
}
|
| 83 |
+
],
|
| 84 |
+
"projects": [
|
| 85 |
+
{
|
| 86 |
+
"name": "",
|
| 87 |
+
"role": "",
|
| 88 |
+
"start_date (MM/DD/YY)": "",
|
| 89 |
+
"end_date (MM/DD/YY)": "",
|
| 90 |
+
"is_current_project": "",
|
| 91 |
+
"description": "",
|
| 92 |
+
"responsibilities": [
|
| 93 |
+
""
|
| 94 |
+
],
|
| 95 |
+
"technologies_used": [
|
| 96 |
+
""
|
| 97 |
+
],
|
| 98 |
+
"project_url": ""
|
| 99 |
+
}
|
| 100 |
+
],
|
| 101 |
+
"publications": [
|
| 102 |
+
{
|
| 103 |
+
"title": "",
|
| 104 |
+
"venue": "",
|
| 105 |
+
"publication_date (MM/DD/YY)": "",
|
| 106 |
+
"authors": [
|
| 107 |
+
""
|
| 108 |
+
],
|
| 109 |
+
"doi": "",
|
| 110 |
+
"url": ""
|
| 111 |
+
}
|
| 112 |
+
],
|
| 113 |
+
"awards": [
|
| 114 |
+
{
|
| 115 |
+
"name": "",
|
| 116 |
+
"issuer": "",
|
| 117 |
+
"date (MM/DD/YY)": "",
|
| 118 |
+
"description": ""
|
| 119 |
+
}
|
| 120 |
+
],
|
| 121 |
+
"professional_memberships": [
|
| 122 |
+
{
|
| 123 |
+
"organization": "",
|
| 124 |
+
"role": "",
|
| 125 |
+
"start_date (MM/DD/YY)": "",
|
| 126 |
+
"end_date (MM/DD/YY)": ""
|
| 127 |
+
}
|
| 128 |
+
],
|
| 129 |
+
"additional_information": {
|
| 130 |
+
"work_authorization": "",
|
| 131 |
+
"security_clearance": "",
|
| 132 |
+
"willing_to_relocate": "",
|
| 133 |
+
"willing_to_travel": "",
|
| 134 |
+
"other_notes": ""
|
| 135 |
+
}
|
| 136 |
+
}
|
| 137 |
+
}
|
templates/school_transcripts.json
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"school_transcripts": {
|
| 3 |
+
"institution_information": {
|
| 4 |
+
"institution_name": "",
|
| 5 |
+
"school_or_college_name": "",
|
| 6 |
+
"campus": "",
|
| 7 |
+
"address": {
|
| 8 |
+
"street": "",
|
| 9 |
+
"apt_ste_flr": "",
|
| 10 |
+
"city": "",
|
| 11 |
+
"state": "",
|
| 12 |
+
"zip_code": "",
|
| 13 |
+
"country": ""
|
| 14 |
+
},
|
| 15 |
+
"telephone_number": "",
|
| 16 |
+
"email": "",
|
| 17 |
+
"website": ""
|
| 18 |
+
},
|
| 19 |
+
"student_information": {
|
| 20 |
+
"full_name": "",
|
| 21 |
+
"first_name": "",
|
| 22 |
+
"last_name": "",
|
| 23 |
+
"date_of_birth (MM/DD/YY)": "",
|
| 24 |
+
"student_id": "",
|
| 25 |
+
"program_name": "",
|
| 26 |
+
"degree_type": "",
|
| 27 |
+
"major": "",
|
| 28 |
+
"minor": "",
|
| 29 |
+
"admission_date (MM/DD/YY)": "",
|
| 30 |
+
"graduation_date (MM/DD/YY)": "",
|
| 31 |
+
"is_current_student": ""
|
| 32 |
+
},
|
| 33 |
+
"transcript_details": {
|
| 34 |
+
"issue_date (MM/DD/YY)": "",
|
| 35 |
+
"has_signature": "",
|
| 36 |
+
"has_official_seal": "",
|
| 37 |
+
"is_official_transcript": "",
|
| 38 |
+
"grading_scale": "",
|
| 39 |
+
"overall_gpa": "",
|
| 40 |
+
"credits_attempted": "",
|
| 41 |
+
"credits_earned": ""
|
| 42 |
+
},
|
| 43 |
+
"remarks": {
|
| 44 |
+
"academic_standing": "",
|
| 45 |
+
"honors": "",
|
| 46 |
+
"warnings_or_probations": "",
|
| 47 |
+
"other_notes": ""
|
| 48 |
+
},
|
| 49 |
+
"signatory": {
|
| 50 |
+
"name": "",
|
| 51 |
+
"title": "",
|
| 52 |
+
"signature_date (MM/DD/YY)": ""
|
| 53 |
+
}
|
| 54 |
+
}
|
| 55 |
+
}
|
templates/us_visa.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"us_visa":
|
| 3 |
+
{
|
| 4 |
+
"number": "",
|
| 5 |
+
"control_number": "",
|
| 6 |
+
"type": "",
|
| 7 |
+
"class": "",
|
| 8 |
+
"entries": "",
|
| 9 |
+
"issue_date (MM/DD/YY)": "",
|
| 10 |
+
"expiration_date (MM/DD/YY)": "",
|
| 11 |
+
"issuing_post": "",
|
| 12 |
+
"applicant_info": {
|
| 13 |
+
"surname": "",
|
| 14 |
+
"given_names": "",
|
| 15 |
+
"date_of_birth (MM/DD/YY)": "",
|
| 16 |
+
"nationality_country_name": "",
|
| 17 |
+
"sex (Fullform)": "",
|
| 18 |
+
"passport_number": ""
|
| 19 |
+
}
|
| 20 |
+
}
|
| 21 |
+
}
|