github-actions[bot] committed on
Commit
8e52fc5
·
0 Parent(s):

Sync from GitHub

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.github/workflows/sync-hf.yml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync Hugging Face Space
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ sync:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - name: Checkout
13
+ uses: actions/checkout@v4
14
+ with:
15
+ fetch-depth: 0
16
+ - name: Push to Hugging Face Space
17
+ env:
18
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
19
+ HF_SPACE: pradyten/pdf-extractor
20
+ run: |
21
+ set -euo pipefail
22
+ if [ -z "${HF_TOKEN}" ]; then
23
+ echo "HF_TOKEN is not set."
24
+ exit 1
25
+ fi
26
+ sync_dir="$(mktemp -d)"
27
+ git ls-files -z | tar --null -T - -cf - | tar -xf - -C "${sync_dir}"
28
+ find "${sync_dir}" -type f -name "*.pdf" -delete
29
+ rm -rf "${sync_dir}/sample"
30
+ cd "${sync_dir}"
31
+ git init
32
+ git config user.name "github-actions[bot]"
33
+ git config user.email "github-actions[bot]@users.noreply.github.com"
34
+ git add .
35
+ git commit -m "Sync from GitHub"
36
+ git remote add hf "https://user:${HF_TOKEN}@huggingface.co/spaces/${HF_SPACE}"
37
+ git push --force hf HEAD:main
.gitignore ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
208
+
209
+ *.pdf
210
+
211
+ !sample/*.pdf
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [server]
2
+ enableCORS = false
3
+ enableXsrfProtection = false
AGENTS.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Repository Guidelines
2
+
3
+ ## Project Structure & Module Organization
4
+ - `extractor.py` contains PDF rendering, template selection, and OpenAI calls.
5
+ - `templates/` holds JSON extraction templates referenced by `TEMPLATE_REGISTRY`.
6
+ - `src/streamlit_app.py` is the Hugging Face Space UI entrypoint.
7
+ - `Dockerfile` builds the Space image (Streamlit on port 8501).
8
+ - `.streamlit/config.toml` contains Space-friendly Streamlit server settings.
9
+ - `README.md` includes Space metadata front matter and usage notes.
10
+ - The UI relies on filename keywords to select templates (see `TEMPLATE_REGISTRY`).
11
+ - Sample PDFs are fetched from the HF dataset set by `SAMPLE_DATASET_REPO`.
12
+
13
+ ## Build, Test, and Development Commands
14
+ - Install dependencies with `python -m pip install -r requirements.txt`.
15
+ - Local CLI extraction prompts for a PDF path and prints JSON:
16
+ - `python extractor.py`
17
+ - Run the Space UI locally:
18
+ - `streamlit run src/streamlit_app.py`
19
+ - Quick import sanity check:
20
+ - `python -c "import extractor; print(extractor.DEFAULT_MODEL)"`
21
+
22
+ ## Coding Style & Naming Conventions
23
+ - Keep 2-space indentation in `extractor.py`.
24
+ - Use snake_case for functions/variables, UPPER_SNAKE for constants, and add type hints for new functions.
25
+ - Template JSON filenames should be snake_case and registered via lowercase filename keywords in `TEMPLATE_REGISTRY`.
26
+
27
+ ## Testing Guidelines
28
+ - No automated test suite exists yet. If adding tests, use `pytest` under `tests/`.
29
+ - Validate that model output matches the exact template schema and that filename keywords map to the right template.
30
+
31
+ ## Commit & Pull Request Guidelines
32
+ - No established commit convention; use short, imperative subjects.
33
+ - PRs should include the document type, template files touched, example filename keyword, and any config/env changes.
34
+
35
+ ## Security & Configuration Tips
36
+ - Set `OPENAI_API_KEY` for local runs and the Space; optionally override `EXTRACTOR_MODEL_ALIAS`.
37
+ - Avoid committing sensitive PDFs or output data; use redacted samples for demos.
38
+
39
+ ## Automation
40
+ - `.github/workflows/sync-hf.yml` pushes `main` to the HF Space on each commit using `HF_TOKEN`.
41
+ - Treat GitHub as the source of truth; direct edits on HF may be overwritten.
42
+ - The workflow force-pushes a fresh snapshot to avoid blocked legacy binaries in history.
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.13.5-slim

WORKDIR /app

# Build tools for wheels without sdists, curl for the healthcheck, git for
# huggingface_hub operations. --no-install-recommends keeps the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies BEFORE copying source so code edits do not
# invalidate the (slow) pip layer of the build cache.
COPY requirements.txt ./
RUN pip3 install --no-cache-dir -r requirements.txt

# Application code and assets.
COPY src/ ./src/
COPY .streamlit/ ./.streamlit/
COPY extractor.py ./
COPY templates/ ./templates/

EXPOSE 8501

HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Pdf Extractor
3
+ emoji: 🚀
4
+ colorFrom: red
5
+ colorTo: red
6
+ sdk: docker
7
+ app_port: 8501
8
+ tags:
9
+ - streamlit
10
+ pinned: false
11
+ short_description: pdf_extractor
12
+ ---
13
+
14
+ # PDF-to-JSON Extractor with AI
15
+
16
+ Intelligent PDF document parser that extracts structured JSON data using OpenAI's GPT models and computer vision.
17
+
18
+ ## 📋 Table of Contents
19
+ - [Overview](#overview)
20
+ - [Features](#features)
21
+ - [Technology Stack](#technology-stack)
22
+ - [Installation](#installation)
23
+ - [Usage](#usage)
24
+ - [Configuration](#configuration)
25
+ - [Author](#author)
26
+
27
+ ## 🎯 Overview
28
+
29
+ This application converts PDF documents into structured JSON format using:
30
+ - **OpenAI GPT-4 Vision**: For intelligent content extraction
31
+ - **Template-based extraction**: Customizable JSON schemas for different document types
32
+ - **Streamlit UI**: Interactive web interface for easy PDF processing
33
+ - **Docker support**: Containerized deployment for production environments
34
+
35
+ Perfect for automating data extraction from resumes, invoices, forms, and other structured documents.
36
+
37
+ ## ✨ Features
38
+
39
+ - **AI-Powered Extraction**: Uses GPT-4 Vision to understand document structure
40
+ - **Template System**: Pre-configured JSON templates for common document types
41
+ - **Batch Processing**: Handle multiple PDFs efficiently
42
+ - **Image Preview**: Visual confirmation of PDF pages before extraction
43
+ - **Format Validation**: Ensures extracted JSON matches defined schema
44
+ - **Hugging Face Spaces**: Ready for cloud deployment
45
+
46
+ ## 🛠 Technology Stack
47
+
48
+ - **Python 3.9+** - Primary programming language
49
+ - **OpenAI API** - GPT-4 Vision for intelligent extraction
50
+ - **pypdfium2** - PDF rendering and image conversion
51
+ - **Streamlit** - Interactive web UI framework
52
+ - **Pillow (PIL)** - Image processing
53
+ - **Pandas** - Data manipulation
54
+
55
+ ## 🚀 Installation
56
+
57
+ ### Prerequisites
58
+ - Python 3.9 or higher
59
+ - OpenAI API key ([Get one here](https://platform.openai.com/api-keys))
60
+
61
+ ### Setup
62
+
63
+ 1. Clone the repository:
64
+ \`\`\`bash
65
+ git clone https://github.com/pradyten/pdf-extractor.git
66
+ cd pdf-extractor
67
+ \`\`\`
68
+
69
+ 2. Install dependencies:
70
+ \`\`\`bash
71
+ pip install -r requirements.txt
72
+ \`\`\`
73
+
74
+ 3. Configure OpenAI API key:
75
+ \`\`\`bash
76
+ export OPENAI_API_KEY='your-api-key-here'
77
+ \`\`\`
78
+
79
+ ## 💻 Usage
80
+
81
+ ### Command Line
82
+ \`\`\`bash
83
+ python extractor.py path/to/document.pdf
84
+ \`\`\`
85
+
86
+ ### Streamlit Web UI
87
+ \`\`\`bash
88
+ streamlit run src/streamlit_app.py
89
+ \`\`\`
90
+
91
+ ### Docker
92
+ \`\`\`bash
93
+ docker build -t pdf-extractor .
94
+ docker run -p 8501:8501 -e OPENAI_API_KEY='your-key' pdf-extractor
95
+ \`\`\`
96
+
97
+ ## ⚙️ Configuration
98
+
99
+ Add custom template JSON files under \`templates/\` and register their filename keywords in \`TEMPLATE_REGISTRY\` inside \`extractor.py\` for different document types (resumes, invoices, forms).
100
+
101
+ ## 🎓 Use Cases
102
+
103
+ - **HR & Recruitment**: Batch process resume PDFs
104
+ - **Accounting**: Extract invoice data
105
+ - **Data Entry**: Automate form digitization
106
+ - **Document Management**: Convert scanned documents to searchable JSON
107
+
108
+ ## 🔒 Security & Privacy
109
+
110
+ - Never commit API keys - use environment variables
111
+ - PDFs are processed in-memory, not stored
112
+ - Review OpenAI's data usage policies for compliance
113
+
114
+ ## 👨‍💻 Author
115
+
116
+ **Pradyumn Tendulkar**
117
+
118
+ Data Science Graduate Student | ML Engineer
119
+
120
+ - GitHub: [@pradyten](https://github.com/pradyten)
121
+ - LinkedIn: [Pradyumn Tendulkar](https://www.linkedin.com/in/p-tendulkar/)
122
+ - Email: pktendulkar@wpi.edu
123
+
124
+ ---
125
+
126
+ ⭐ If you found this project helpful, please consider giving it a star!
127
+
128
+ 📝 **License:** MIT
extractor.py ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import base64
import io
import json
import os
import sys
from typing import Any, Dict, List, Optional, Tuple

import pypdfium2 as pdfium
from openai import OpenAI
9
+
10
+
11
+ # path to templates folder (relative to this file)
12
+ TEMPLATES_DIR = os.path.join(os.path.dirname(__file__), "templates")
13
+
14
+
15
# keyword in PDF filename (lowercase) -> (document_type, template_file).
# Several keywords deliberately share one template (e.g. the many spellings
# of "employment letter"), so each pair is written once per keyword.
_KEYWORD_SPECS: Dict[str, Tuple[str, str]] = {
    # Immigration forms
    "i129": ("USCIS Form I-129 H-1B Petition", "i129_h1b_petition.json"),
    "i94": ("Form I-94 Arrival/Departure Record", "i_94.json"),
    "i-94": ("Form I-94 Arrival/Departure Record", "i_94.json"),
    "i20": ("Form I-20 Certificate of Eligibility", "proof_of_in_country_status.json"),
    "i-20": ("Form I-20 Certificate of Eligibility", "proof_of_in_country_status.json"),
    # Identity documents
    "passport": ("Passport", "passport.json"),
    "visa": ("US Visa", "us_visa.json"),
    # Education documents
    "transcript": ("Academic Transcript", "school_transcripts.json"),
    "diploma": ("Diploma", "diplomas.json"),
    # Employment documents
    "employment letter": ("Employment Letter", "employment_letter.json"),
    "offer letter": ("Employment Letter", "employment_letter.json"),
    "offer-letter": ("Employment Letter", "employment_letter.json"),
    "offer_letter": ("Employment Letter", "employment_letter.json"),
    "employment_letter": ("Employment Letter", "employment_letter.json"),
    "employment": ("Employment Letter", "employment_letter.json"),
    "resume": ("Resume/CV", "resume.json"),
    "cv": ("Resume/CV", "resume.json"),
    # Tax and corporate documents
    "fein": ("Corporate Tax Returns", "corporate_tax_returns.json"),
    "cp575": ("Corporate Tax Returns", "corporate_tax_returns.json"),
    "tax": ("Corporate Tax Returns", "corporate_tax_returns.json"),
    # Personal documents
    "marriage": ("Marriage Certificate", "marriage_certificate.json"),
    "marriage_certificate": ("Marriage Certificate", "marriage_certificate.json"),
    # Proof of status
    "proof": ("Proof of In-Country Status", "proof_of_in_country_status.json"),
}

# Public registry consumed by infer_template_from_filename() and the UI:
# keyword -> {"document_type": ..., "template_file": ...}.
TEMPLATE_REGISTRY: Dict[str, Dict[str, str]] = {
    keyword: {"document_type": doc_type, "template_file": template_file}
    for keyword, (doc_type, template_file) in _KEYWORD_SPECS.items()
}
124
+
125
+
126
# Logical model aliases for this extractor (OpenAI ChatGPT models).
# "default" is resolved to DEFAULT_MODEL inside call_openai_extract().
_CURRENT_MODEL_ALIASES = [
    "default",
    "gpt-4.1-mini",
    "gpt-4.1",
    "gpt-4o-mini",
    "gpt-4o",
]
# Legacy/dated aliases kept for compatibility.
_LEGACY_MODEL_ALIASES = [
    "gpt-4.1-2025-04-14",
    "gpt-4.1-mini-2025-04-14",
    "gpt-5-2025-08-07",
    "gpt-5-mini-2025-08-07",
]
ALLOWED_MODELS = _CURRENT_MODEL_ALIASES + _LEGACY_MODEL_ALIASES

# Model used when a caller passes "default"; overridable via environment.
DEFAULT_MODEL = os.getenv("EXTRACTOR_MODEL_ALIAS", "gpt-4.1-mini")

# Name of the environment variable that must hold the OpenAI API key.
OPENAI_API_KEY_ENV = "OPENAI_API_KEY"
143
+ _openai_client: Optional[OpenAI] = None
144
+
145
+
146
def load_template(template_file: str) -> Dict[str, Any]:
    """Read a JSON extraction template from TEMPLATES_DIR and return it.

    Raises FileNotFoundError when the template file does not exist.
    """
    template_path = os.path.join(TEMPLATES_DIR, template_file)
    if not os.path.exists(template_path):
        raise FileNotFoundError(f"Template not found: {template_path}")
    with open(template_path, "r", encoding="utf-8") as handle:
        return json.load(handle)
152
+
153
+
154
def infer_template_from_filename(filename: str) -> Tuple[str, Dict[str, Any]]:
    """Pick (document_type, template) for a PDF based on its file name.

    The lowercased basename is scanned for the first matching keyword in
    TEMPLATE_REGISTRY (dict insertion order), and the associated template
    JSON is loaded.

    Example:
      - 'I129 HALF.pdf'      -> matches 'i129'     -> i129_h1b_petition.json
      - 'passport_rohan.pdf' -> matches 'passport' -> passport.json
      - 'F1_visa_page1.pdf'  -> matches 'visa'     -> us_visa.json
      - 'i94_record.pdf'     -> matches 'i94'      -> i_94.json

    Raises ValueError when no registered keyword appears in the name.
    """
    basename = os.path.basename(filename).lower()

    matched = next(
        (cfg for keyword, cfg in TEMPLATE_REGISTRY.items() if keyword in basename),
        None,
    )
    if matched is not None:
        return matched["document_type"], load_template(matched["template_file"])

    # fallback: raise to force user to add mapping or rename file
    raise ValueError(
        f"Could not infer document type from filename '{basename}'. "
        f"Known keywords: {list(TEMPLATE_REGISTRY.keys())}"
    )
177
+
178
+
179
def pdf_bytes_to_base64_images(pdf_bytes: bytes, max_pages: int = 10) -> List[str]:
    """Render PDF pages to JPEG and return base64 strings (no data-URL prefix).

    At most ``max_pages`` pages are rendered (pass 0 or None to render all).
    Render scale and JPEG quality are lowered for longer documents so the
    total payload size stays manageable.
    """
    document = pdfium.PdfDocument(pdf_bytes)
    encoded_pages: List[str] = []

    try:
        total_pages = len(document)
        if max_pages is not None and max_pages > 0:
            limit = min(total_pages, max_pages)
        else:
            limit = total_pages

        # Adaptive scale/quality to keep payloads manageable.
        if limit <= 2:
            scale, quality = 4.17, 80  # ~300 DPI
        elif limit <= 10:
            scale, quality = 2.0, 60   # ~145 DPI
        else:
            scale, quality = 1.5, 60   # ~110 DPI

        for page_index in range(limit):
            rendered = document[page_index].render(scale=scale).to_pil()
            buffer = io.BytesIO()
            rendered.save(buffer, format="JPEG", quality=quality)
            encoded_pages.append(base64.b64encode(buffer.getvalue()).decode("utf-8"))
            buffer.close()
            rendered.close()
    finally:
        # Always release the pdfium document handle, even on render errors.
        document.close()

    return encoded_pages
220
+
221
+
222
def build_extraction_prompt(document_type: str, template: Dict[str, Any]) -> str:
    """Compose the prompt that pins the model to the template's JSON schema.

    The template is embedded verbatim (pretty-printed) so the model echoes
    the exact field structure back, with "" for any missing field.
    """
    template_json = json.dumps(template, indent=2)
    return f"""
You are a document data extraction system.

Document Type: {document_type}

Extract all information from the provided document image(s) and return it in the following exact JSON structure:

{template_json}

Instructions:
- Output only valid JSON matching exactly the structure above
- Do NOT add explanations
- Do NOT wrap the JSON in markdown, backticks, or code fences
- If a field is missing, set it to ""
- Use the exact field names; do not modify the structure
- Extract information from ALL pages
"""
244
+
245
+
246
def _get_openai_client() -> OpenAI:
    """Return a lazily-created, module-cached OpenAI client.

    Raises RuntimeError when the API-key environment variable is unset.
    """
    global _openai_client
    if _openai_client is not None:
        return _openai_client

    api_key = os.getenv(OPENAI_API_KEY_ENV)
    if not api_key:
        raise RuntimeError(
            f"{OPENAI_API_KEY_ENV} is not set. "
            "Set it in your environment or CI secrets."
        )
    _openai_client = OpenAI(api_key=api_key)
    return _openai_client
257
+
258
+
259
+ def _extract_text_from_response(response: Any) -> str:
260
+ output_text = getattr(response, "output_text", None)
261
+ if isinstance(output_text, str) and output_text.strip():
262
+ return output_text.strip()
263
+
264
+ output = getattr(response, "output", None)
265
+ if isinstance(output, list):
266
+ parts: List[str] = []
267
+ for item in output:
268
+ content = getattr(item, "content", None)
269
+ if content is None and isinstance(item, dict):
270
+ content = item.get("content")
271
+ if isinstance(content, list):
272
+ for block in content:
273
+ if isinstance(block, dict):
274
+ block_type = block.get("type")
275
+ if block_type in ("output_text", "text"):
276
+ parts.append(block.get("text", ""))
277
+ else:
278
+ block_type = getattr(block, "type", None)
279
+ if block_type in ("output_text", "text"):
280
+ parts.append(getattr(block, "text", ""))
281
+ return "".join(parts).strip()
282
+
283
+ return ""
284
+
285
+
286
def _invoke_openai(prompt: str, images: List[str], model: str) -> Any:
    """Send the prompt plus page images to the OpenAI Responses API.

    Each image is attached as a base64 JPEG data URL. Temperature is pinned
    to 0 for deterministic extraction. Returns the raw API response object.
    """
    client = _get_openai_client()

    user_content: List[Dict[str, Any]] = [{"type": "input_text", "text": prompt}]
    user_content.extend(
        {
            "type": "input_image",
            "image_url": f"data:image/jpeg;base64,{img_b64}",
        }
        for img_b64 in images
    )

    system_message = {
        "role": "system",
        "content": [
            {
                "type": "input_text",
                "text": "You are a precise document extraction engine.",
            }
        ],
    }

    return client.responses.create(
        model=model,
        temperature=0,
        input=[
            system_message,
            {"role": "user", "content": user_content},
        ],
    )
323
+
324
+
325
def call_openai_extract(
    document_type: str,
    template: Dict[str, Any],
    images: List[str],
    model: str = DEFAULT_MODEL,
) -> Dict[str, Any]:
    """Run the extraction prompt through OpenAI and parse the JSON reply.

    "default" is resolved to DEFAULT_MODEL; any other alias must appear in
    ALLOWED_MODELS. Markdown code fences around the reply are tolerated and
    stripped before parsing.

    Raises ValueError for an unsupported model alias, an empty reply, or a
    reply that is not valid JSON.
    """
    resolved_model = DEFAULT_MODEL if model == "default" else model
    if resolved_model not in ALLOWED_MODELS:
        raise ValueError(
            f"Unsupported model alias '{model}'. "
            f"Supported values: {ALLOWED_MODELS}. "
            "This extractor uses OpenAI ChatGPT models."
        )

    response = _invoke_openai(
        build_extraction_prompt(document_type, template), images, resolved_model
    )
    json_str = _extract_text_from_response(response).strip()

    # Strip optional markdown fences (```json ... ```)
    if json_str.startswith("```"):
        fence_lines = json_str.splitlines()
        if fence_lines and fence_lines[0].lstrip().startswith("```"):
            fence_lines = fence_lines[1:]
        if fence_lines and fence_lines[-1].strip().startswith("```"):
            fence_lines = fence_lines[:-1]
        json_str = "\n".join(fence_lines).strip()

    if not json_str:
        raise ValueError(
            "Model response did not contain any text content to parse as JSON."
        )

    try:
        return json.loads(json_str)
    except json.JSONDecodeError as exc:
        snippet = json_str[:500]
        raise ValueError(
            f"Model output was not valid JSON: {exc}. "
            f"First 500 characters of response: {snippet!r}"
        ) from exc
371
+
372
+
373
def extract_using_openai_from_pdf_bytes(
    pdf_bytes: bytes,
    filename: str,
    max_pages: int = 10,
    model: str = DEFAULT_MODEL,
) -> Dict[str, Any]:
    """Backwards-compatible entrypoint used by the Vision Lambda.

    Despite the legacy name, this now uses OpenAI ChatGPT to perform the
    extraction while preserving the JSON contract: the template is picked
    from keywords in ``filename``, pages are rendered to base64 images, and
    the model's JSON reply is returned as a dict.

    Raises RuntimeError when no page images could be rendered.
    """
    document_type, template = infer_template_from_filename(filename)

    page_images = pdf_bytes_to_base64_images(pdf_bytes, max_pages=max_pages)
    if not page_images:
        raise RuntimeError("No images were extracted from PDF")

    return call_openai_extract(document_type, template, page_images, model=model)
391
+
392
+
393
+ def _prompt_for_pdf_path() -> str:
394
+ """
395
+ Simple CLI helper for local runs. Web UI integrations can call
396
+ extract_using_openai_from_pdf_bytes directly instead.
397
+ """
398
+ path = input("Enter path to PDF: ").strip()
399
+ if not path:
400
+ raise SystemExit("No PDF path provided.")
401
+ return path
402
+
403
+
404
+ if __name__ == "__main__":
405
+ pdf_path = _prompt_for_pdf_path()
406
+ with open(pdf_path, "rb") as fh:
407
+ pdf_data = fh.read()
408
+ result = extract_using_openai_from_pdf_bytes(pdf_data, pdf_path)
409
+ print(json.dumps(result, ensure_ascii=False))
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ altair
2
+ huggingface_hub
3
+ openai
4
+ pandas
5
+ pillow
6
+ pypdfium2
7
+ streamlit==1.29.0
src/streamlit_app.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import json
3
+ import os
4
+ import sys
5
+
6
+ import streamlit as st
7
+ import pypdfium2 as pdfium
8
+ from huggingface_hub import HfApi, hf_hub_download
9
+
10
+ ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
11
+ if ROOT_DIR not in sys.path:
12
+ sys.path.insert(0, ROOT_DIR)
13
+
14
+ from extractor import extract_using_openai_from_pdf_bytes, TEMPLATE_REGISTRY
15
+
16
# Hub dataset that hosts the bundled sample PDFs; override via env var.
SAMPLE_DATASET_REPO = os.getenv("SAMPLE_DATASET_REPO", "pradyten/pdf-extractor-samples")
20
+
21
+
22
st.set_page_config(page_title="PDF Extractor", layout="wide")

# Small design system injected as raw CSS: fonts, palette variables, and
# rounded "card" styling for columns, buttons, and the file uploader.
_CUSTOM_CSS = """
    <style>
    @import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@500;700&family=Plus+Jakarta+Sans:wght@400;500;600&display=swap');
    :root {
        --bg-0: #f3ede4;
        --bg-1: #fbf5ea;
        --panel: #ffffff;
        --border: rgba(16, 24, 40, 0.12);
        --text: #121212;
        --muted: #5b616b;
        --accent: #d4552d;
        --accent-dark: #b44725;
        --shadow: 0 18px 50px rgba(20, 20, 20, 0.12);
    }
    html, body, [data-testid="stAppViewContainer"] {
        background: radial-gradient(1200px 600px at 10% -10%, var(--bg-0) 0%, #f7f2e9 45%, var(--bg-1) 100%);
        color: var(--text);
        font-family: "Plus Jakarta Sans", system-ui, -apple-system, "Segoe UI", sans-serif;
    }
    h1, h2, h3, h4, h5 {
        font-family: "Space Grotesk", system-ui, -apple-system, "Segoe UI", sans-serif;
        letter-spacing: -0.02em;
    }
    .main .block-container {
        max-width: 1200px;
        padding-top: 2.5rem;
        padding-bottom: 3rem;
    }
    div[data-testid="column"] > div {
        background: var(--panel);
        border: 1px solid var(--border);
        border-radius: 18px;
        padding: 1.25rem 1.5rem 1.5rem 1.5rem;
        box-shadow: var(--shadow);
    }
    .stButton > button {
        background: var(--accent);
        color: #ffffff;
        border: none;
        border-radius: 999px;
        padding: 0.65rem 1.4rem;
        font-weight: 600;
    }
    .stButton > button:hover {
        background: var(--accent-dark);
        color: #ffffff;
    }
    div[data-testid="stFileUploader"] {
        border: 1px dashed rgba(16, 24, 40, 0.18);
        border-radius: 14px;
        padding: 0.6rem;
        background: rgba(248, 244, 236, 0.6);
    }
    .stAlert {
        border-radius: 12px;
    }
    pre, code, .stCodeBlock {
        border-radius: 12px !important;
    }
    #MainMenu, footer {
        visibility: hidden;
    }
    </style>
    """
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
91
+
92
+
93
def _render_pdf_preview(pdf_bytes: bytes) -> None:
    """Render page 1 of the PDF as an image, or a friendly notice on failure."""
    document = None
    try:
        document = pdfium.PdfDocument(pdf_bytes)
        if len(document) < 1:
            st.info("No pages found in this PDF.")
            return
        first_page = document[0]
        rendered = first_page.render(scale=2.0).to_pil()
        st.image(rendered, caption="Preview (page 1)", use_column_width=True)
    except Exception as exc:  # pragma: no cover - UI preview path
        st.warning(f"Preview unavailable: {exc}")
    finally:
        # Release pdfium resources even when rendering fails midway.
        if document is not None:
            document.close()
108
+
109
+
110
+ def _load_pdf_state(uploaded_file) -> tuple[bytes, str, str]:
111
+ pdf_bytes = uploaded_file.getvalue()
112
+ digest = hashlib.sha256(pdf_bytes).hexdigest()
113
+ return pdf_bytes, uploaded_file.name, digest
114
+
115
+
116
@st.cache_data(show_spinner=False)
def _list_sample_pdfs(repo_id: str) -> list[str]:
    """Return the sorted *.pdf file names in a Hub dataset repo ([] on failure)."""
    api = HfApi()
    try:
        repo_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    except Exception:
        # Network/auth problems degrade to "no samples" rather than crashing.
        return []
    return sorted(f for f in repo_files if f.lower().endswith(".pdf"))
124
+
125
+
126
@st.cache_data(show_spinner=False)
def _load_sample_state(repo_id: str, filename: str) -> tuple[bytes, str, str]:
    """Download one sample PDF from the Hub; return (bytes, name, sha256-hex)."""
    local_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset")
    with open(local_path, "rb") as handle:
        data = handle.read()
    return data, filename, hashlib.sha256(data).hexdigest()
133
+
134
+
135
+ def _build_download_name(filename: str) -> str:
136
+ base = os.path.splitext(filename)[0] if filename else "extraction"
137
+ safe = "".join(ch if ch.isalnum() or ch in ("-", "_") else "_" for ch in base)
138
+ if not safe:
139
+ safe = "extraction"
140
+ return f"{safe}_extracted.json"
141
+
142
+
143
def _reset_pdf_state() -> None:
    """Clear the active PDF and any extraction output from the session."""
    for key in (
        "pdf_bytes",
        "pdf_filename",
        "pdf_digest",
        "extract_result",
        "extract_error",
        "extract_digest",
        "extract_filename",
    ):
        st.session_state[key] = None
151
+
152
+
153
def _supported_doc_types() -> list[str]:
    """Ordered, de-duplicated document_type values from the template registry."""
    # An insertion-ordered dict doubles as an ordered set here.
    ordered: dict[str, None] = {}
    for cfg in TEMPLATE_REGISTRY.values():
        doc_type = cfg.get("document_type")
        if doc_type:
            ordered.setdefault(doc_type, None)
    return list(ordered)
160
+
161
+
162
# Seed every session key the app touches so later reads never KeyError.
for _key in (
    "extract_result",
    "extract_error",
    "extract_digest",
    "extract_filename",
    "pdf_bytes",
    "pdf_filename",
    "pdf_digest",
    "input_mode_prev",
):
    if _key not in st.session_state:
        st.session_state[_key] = None
178
+
179
+
180
# Page header plus the two-column layout: input/preview on the left,
# extraction output on the right.
st.markdown("## PDF Extractor")
st.markdown(
    "Choose a sample or upload your own PDF, preview it, then click Extract "
    "to generate structured JSON on the right."
)

left, right = st.columns([1, 1], gap="large")
187
+
188
def _remember_pdf(data: bytes, name: str, digest: str) -> None:
    # Store a newly selected PDF and drop any stale extraction output.
    # A matching digest means the same document is already active.
    if st.session_state.pdf_digest != digest:
        st.session_state.pdf_bytes = data
        st.session_state.pdf_filename = name
        st.session_state.pdf_digest = digest
        st.session_state.extract_result = None
        st.session_state.extract_error = None
        st.session_state.extract_digest = digest
        st.session_state.extract_filename = name


with left:
    st.markdown("### Upload + Preview")
    source_mode = st.radio(
        "Input source",
        ["Upload PDF", "Use sample"],
        horizontal=True,
        label_visibility="collapsed",
        key="input_mode",
    )
    # Switching between upload/sample invalidates the current document.
    if st.session_state.input_mode_prev != source_mode:
        _reset_pdf_state()
        st.session_state.input_mode_prev = source_mode

    chosen_sample = None
    upload = None

    if source_mode == "Use sample":
        sample_files = _list_sample_pdfs(SAMPLE_DATASET_REPO)
        if not sample_files:
            st.info("No sample PDFs found in the sample dataset yet.")
            _reset_pdf_state()
        picker_value = st.selectbox(
            "Choose a sample",
            ["Choose a sample..."] + sample_files,
            label_visibility="collapsed",
            key="sample_choice",
        )
        # The placeholder entry is not a valid selection.
        chosen_sample = picker_value if picker_value in sample_files else None
        if chosen_sample is None:
            _reset_pdf_state()
    else:
        upload = st.file_uploader(
            "Upload a PDF",
            type=["pdf"],
            accept_multiple_files=False,
            label_visibility="collapsed",
            key="pdf_uploader",
            help="File name should include a known keyword (for example: resume, passport, i129).",
        )

    if source_mode == "Use sample" and chosen_sample:
        try:
            sample_bytes, sample_name, sample_digest = _load_sample_state(
                SAMPLE_DATASET_REPO,
                chosen_sample,
            )
        except Exception as exc:  # pragma: no cover - sample load path
            st.error(f"Sample load failed: {exc}")
        else:
            _remember_pdf(sample_bytes, sample_name, sample_digest)
            st.markdown(f"**Sample:** `{st.session_state.pdf_filename}`")
            _render_pdf_preview(st.session_state.pdf_bytes)
    elif source_mode == "Upload PDF" and upload is not None:
        upload_bytes, upload_name, upload_digest = _load_pdf_state(upload)
        _remember_pdf(upload_bytes, upload_name, upload_digest)
        st.markdown(f"**File:** `{st.session_state.pdf_filename}`")
        _render_pdf_preview(st.session_state.pdf_bytes)
    else:
        st.info("Upload a PDF or choose a sample to preview it here.")

    st.markdown("#### Notes")
    st.caption(
        "Template selection is inferred from the filename. If extraction fails, "
        "rename the file to include a supported keyword (for example: "
        "`resume.pdf`, `passport_jane.pdf`, `i129_petition.pdf`)."
    )
    st.caption(f"Sample dataset: `{SAMPLE_DATASET_REPO}`")
    st.markdown("#### Supported documents")
    st.markdown("\n".join(f"- {doc}" for doc in _supported_doc_types()))
274
+
275
with right:
    st.markdown("### Extract")
    model_choice = st.selectbox(
        "Model",
        ["default", "gpt-4.1-mini", "gpt-4.1", "gpt-4o-mini", "gpt-4o"],
        index=1,
        help="Choose a model or use default (EXTRACTOR_MODEL_ALIAS).",
    )
    # NOTE(review): the literal string "default" is forwarded verbatim as the
    # model name below — confirm the extractor maps it to EXTRACTOR_MODEL_ALIAS.

    has_api_key = bool(os.getenv("OPENAI_API_KEY"))
    if not has_api_key:
        st.warning("OPENAI_API_KEY is not set. Add it to your environment or Space secrets.")

    run_extraction = st.button(
        "Extract",
        use_container_width=False,
        disabled=st.session_state.pdf_bytes is None or not has_api_key,
    )

    if run_extraction:
        with st.spinner("Extracting structured JSON..."):
            try:
                st.session_state.extract_result = extract_using_openai_from_pdf_bytes(
                    st.session_state.pdf_bytes,
                    st.session_state.pdf_filename,
                    model=model_choice,
                )
                st.session_state.extract_error = None
            except Exception as exc:  # pragma: no cover - runtime error path
                message = str(exc)
                # Substitute a friendlier hint for the common 403 failure mode.
                if "403" in message or "PermissionDenied" in message:
                    message = (
                        "OpenAI request was rejected (403). "
                        "Check OPENAI_API_KEY, model access, and billing."
                    )
                st.session_state.extract_error = message
                st.session_state.extract_result = None

    if st.session_state.extract_error:
        st.error(st.session_state.extract_error)

    if st.session_state.extract_result is None:
        st.info("Extraction output will appear here.")
    else:
        st.markdown("#### JSON Output")
        pretty_json = json.dumps(
            st.session_state.extract_result,
            indent=2,
            ensure_ascii=False,
        )
        st.code(pretty_json, language="json")
        st.download_button(
            "Download JSON",
            data=pretty_json,
            file_name=_build_download_name(st.session_state.pdf_filename or ""),
            mime="application/json",
        )
templates/corporate_tax_returns.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "legal_business_name": "",
3
+ "trade_name_dba": "",
4
+ "contact_person": {
5
+ "first_name": "",
6
+ "last_name": "",
7
+ "title": ""
8
+ },
9
+ "business_address": {
10
+ "street": "",
11
+ "apt_ste_flr": "",
12
+ "city": "",
13
+ "state": "",
14
+ "zip_code": ""
15
+ },
16
+ "telephone_number": "",
17
+ "naics_code": "",
18
+ "type_of_business": "",
19
+ "federal_employer_identification_number": "",
20
+ "gross_annual_income": "",
21
+ "net_annual_income": "",
22
+ "company_signatories": [
23
+ {
24
+ "name": "",
25
+ "title": "",
26
+ "signature_date (MM/DD/YY)": ""
27
+ }
28
+ ],
29
+ "tax_year": "",
30
+ "form_type": ""
31
+ }
templates/diplomas.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "diploma": {
3
+ "institution_information": {
4
+ "institution_name": "",
5
+ "school_or_college_name": "",
6
+ "campus": "",
7
+ "address": {
8
+ "street": "",
9
+ "apt_ste_flr": "",
10
+ "city": "",
11
+ "state": "",
12
+ "zip_code": "",
13
+ "country": ""
14
+ },
15
+ "telephone_number": "",
16
+ "email": "",
17
+ "website": ""
18
+ },
19
+ "student_information": {
20
+ "full_name": "",
21
+ "first_name": "",
22
+ "last_name": "",
23
+ "date_of_birth (MM/DD/YY)": "",
24
+ "student_id": ""
25
+ },
26
+ "diploma_details": {
27
+ "has_signature": "",
28
+ "has_official_seal": "",
29
+ "issue_date (MM/DD/YY)": "",
30
+ "degree_type": "",
31
+ "major": "",
32
+ "minor": "",
33
+ "concentration": "",
34
+ "honors": "",
35
+ "program_length": ""
36
+ },
37
+ "signatories": [
38
+ {
39
+ "name": "",
40
+ "title": "",
41
+ "signature_date (MM/DD/YY)": ""
42
+ }
43
+ ],
44
+ "document_metadata": {
45
+ "diploma_number": "",
46
+ "serial_number": "",
47
+ "language": "",
48
+ "format": ""
49
+ }
50
+ }
51
+ }
templates/employment_letter.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "employment_letter": {
3
+ "letter_type": "",
4
+ "letter_date (MM/DD/YY)": "",
5
+ "employee_information": {
6
+ "full_name": "",
7
+ "first_name": "",
8
+ "last_name": "",
9
+ "date_of_birth (MM/DD/YY)": "",
10
+ "employee_id": "",
11
+ "job_title": "",
12
+ "department": ""
13
+ },
14
+ "employment_details": {
15
+ "employment_status": "",
16
+ "employment_start_date (MM/DD/YY)": "",
17
+ "employment_end_date (MM/DD/YY)": "",
18
+ "is_currently_employed": "",
19
+ "work_schedule": "",
20
+ "hours_per_week": "",
21
+ "full_time (yes/no)": "",
22
+ "salary": "",
23
+ "salary_frequency (year/month/etc.)": "",
24
+ "bonus_or_variable_pay": "",
25
+ "job_duties_summary": ""
26
+ },
27
+ "employer_information": {
28
+ "legal_business_name": "",
29
+ "trade_name_dba": "",
30
+ "business_address": {
31
+ "street": "",
32
+ "apt_ste_flr": "",
33
+ "city": "",
34
+ "state": "",
35
+ "zip_code": "",
36
+ "country": ""
37
+ },
38
+ "telephone_number": "",
39
+ "email": "",
40
+ "website": ""
41
+ },
42
+ "supervisor_or_hr_contact": {
43
+ "name": "",
44
+ "title": "",
45
+ "phone_number": "",
46
+ "email": ""
47
+ },
48
+ "work_location": [
49
+ {
50
+ "street": "",
51
+ "apt_ste_flr": "",
52
+ "city": "",
53
+ "state": "",
54
+ "zip_code": "",
55
+ "is_third_party_location": "",
56
+ "third_party_name": ""
57
+ }
58
+ ],
59
+ "has_signature": "",
60
+ "signatory": {
61
+ "name": "",
62
+ "title": "",
63
+ "signature_date (MM/DD/YY)": ""
64
+ }
65
+ }
66
+ }
templates/i129_h1b_petition.json ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "i129_h1b_petition": {
3
+ "has_signature": "",
4
+ "form_edition": "",
5
+ "petition_type": "H-1B",
6
+
7
+ "petitioner_information": {
8
+ "petitioner_is_individual": "",
9
+ "individual_petitioner": {
10
+ "family_name": "",
11
+ "given_name": "",
12
+ "middle_name": ""
13
+ },
14
+ "company_information": {
15
+ "company_name": "",
16
+ "fein": "",
17
+ "is_nonprofit_or_government_research_org": "",
18
+ "number_of_employees_in_us": "",
19
+ "year_established": "",
20
+ "type_of_business": "",
21
+ "gross_annual_income": "",
22
+ "net_annual_income": ""
23
+ },
24
+ "mailing_address": {
25
+ "in_care_of": "",
26
+ "street": "",
27
+ "apt_ste_flr": "",
28
+ "city": "",
29
+ "state": "",
30
+ "zip_code": "",
31
+ "province": "",
32
+ "postal_code": "",
33
+ "country": ""
34
+ },
35
+ "contact_information": {
36
+ "daytime_phone": "",
37
+ "mobile_phone": "",
38
+ "email": ""
39
+ }
40
+ },
41
+
42
+ "petition_information": {
43
+ "requested_classification_symbol": "",
44
+ "basis_for_classification": "",
45
+ "most_recent_receipt_number": "",
46
+ "requested_action": "",
47
+ "total_workers_in_petition": ""
48
+ },
49
+
50
+ "beneficiary_information": {
51
+ "type_of_beneficiary": "",
52
+ "group_name_if_entertainment": "",
53
+ "full_name": {
54
+ "family_name": "",
55
+ "given_name": "",
56
+ "middle_name": ""
57
+ },
58
+ "other_names_used": "",
59
+ "date_of_birth": "",
60
+ "sex": "",
61
+ "country_of_birth": "",
62
+ "province_of_birth": "",
63
+ "country_of_citizenship": "",
64
+ "alien_number": "",
65
+ "ssn": "",
66
+ "itin": "",
67
+ "passport": {
68
+ "number": "",
69
+ "country_of_issuance": "",
70
+ "date_issued": "",
71
+ "date_expires": ""
72
+ },
73
+ "i94_number": "",
74
+ "date_of_last_arrival": "",
75
+ "current_us_address": {
76
+ "street": "",
77
+ "apt_ste_flr": "",
78
+ "city": "",
79
+ "state": "",
80
+ "zip_code": ""
81
+ },
82
+ "current_nonimmigrant_status": "",
83
+ "status_expiration_date": "",
84
+ "sevis_number": "",
85
+ "ead_number": "",
86
+ "foreign_address": {
87
+ "street": "",
88
+ "apt_ste_flr": "",
89
+ "city": "",
90
+ "state_province": "",
91
+ "postal_code": "",
92
+ "country": ""
93
+ }
94
+ },
95
+
96
+ "processing_information": {
97
+ "consulate_or_poe_to_notify": {
98
+ "type": "",
99
+ "city": "",
100
+ "state_or_country": ""
101
+ },
102
+ "beneficiary_has_valid_passport": "",
103
+ "applications_for_dependents_filed": "",
104
+ "other_petitions_filed": "",
105
+ "applications_for_replacement_i94": "",
106
+ "beneficiary_in_removal_proceedings": "",
107
+ "previous_nonimmigrant_petitions_filed_for_beneficiary": "",
108
+ "previous_immigrant_petitions_filed": "",
109
+ "is_new_petition": "",
110
+ "beneficiary_previously_held_same_classification_last_7_years": "",
111
+ "beneficiary_denied_same_classification_last_7_years": "",
112
+ "beneficiary_j1_or_j2_history": "",
113
+ "j1_j2_status_dates": ""
114
+ },
115
+
116
+ "employment_information": {
117
+ "job_title": "",
118
+ "lca_or_ETA_case_number": "",
119
+ "worksite_addresses": [
120
+ {
121
+ "street": "",
122
+ "apt_ste_flr": "",
123
+ "city": "",
124
+ "state": "",
125
+ "zip_code": "",
126
+ "is_third_party_location": "",
127
+ "third_party_name": ""
128
+ }
129
+ ],
130
+ "wages": {
131
+ "amount": "",
132
+ "frequency": ""
133
+ },
134
+ "other_compensation": "",
135
+ "employment_dates": {
136
+ "from": "",
137
+ "to": ""
138
+ },
139
+ "full_time_position": "",
140
+ "hours_per_week": "",
141
+ "will_work_in_cnmi": "",
142
+ "itinerary_included": ""
143
+ },
144
+
145
+ "export_control_certification": {
146
+ "license_not_required": "",
147
+ "license_required": ""
148
+ },
149
+
150
+ "h1b_classification_details": {
151
+ "prior_H_or_L_periods_of_stay": [],
152
+ "confirmation_number_h1b_registration": "",
153
+ "passport_used_for_registration": {
154
+ "number": "",
155
+ "country_of_issuance": "",
156
+ "expiration_date": ""
157
+ },
158
+ "proposed_duties_description": "",
159
+ "beneficiary_present_occupation_and_experience": "",
160
+ "beneficiary_controlling_interest_in_petitioner": "",
161
+ "controlling_interest_explanation": "",
162
+ "requesting_change_of_employer_and_was_previous_CNMI_exempt": "",
163
+ "subject_to_CNMI_cap_exemption": ""
164
+ },
165
+
166
+ "h1b_fee_and_dependency_information": {
167
+ "petitioner_is_h1b_dependent": "",
168
+ "petitioner_is_willful_violator": "",
169
+ "beneficiary_is_exempt_from_attestation": "",
170
+ "exemption_reason_salary_over_60000": "",
171
+ "exemption_reason_master_or_higher_degree": "",
172
+ "petitioner_has_50_or_more_employees": "",
173
+ "more_than_50_percent_in_H1B_L1_status": "",
174
+ "beneficiary_highest_education_level": "",
175
+ "field_of_study": "",
176
+ "rate_of_pay_per_year": "",
177
+ "dot_code": "",
178
+ "naics_code": "",
179
+ "acwia_fee_applicable": "",
180
+ "fraud_fee_applicable": "",
181
+ "public_law_114_113_fee_applicable": ""
182
+ },
183
+
184
+ "numerical_cap_information": {
185
+ "cap_type": "",
186
+ "masters_cap_degree_details": {
187
+ "us_institution_name": "",
188
+ "degree_type": "",
189
+ "date_awarded": "",
190
+ "institution_address": ""
191
+ },
192
+ "reason_for_cap_exemption": ""
193
+ },
194
+
195
+ "offsite_assignment_information": {
196
+ "assigned_offsite": "",
197
+ "complies_with_offsite_h1b_requirements": "",
198
+ "paid_actual_or_prevailing_wage_at_offsite": ""
199
+ },
200
+
201
+ "signatory_information": {
202
+ "authorized_signatory_name": "",
203
+ "title": "",
204
+ "signature_date": "",
205
+ "daytime_phone": "",
206
+ "email": ""
207
+ },
208
+
209
+ "preparer_information": {
210
+ "prepared_by_someone_else": "",
211
+ "preparer_name": "",
212
+ "preparer_business_name": "",
213
+ "preparer_address": "",
214
+ "preparer_contact": {
215
+ "daytime_phone": "",
216
+ "fax": "",
217
+ "email": ""
218
+ },
219
+ "preparer_signature_date": ""
220
+ },
221
+
222
+ "additional_information": ""
223
+ }
224
+ }
templates/i_94.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "I-94": {
3
+ "Record_number": "",
4
+ "Recent_date_of_entry (MM/DD/YY)": "",
5
+ "class_of_admission": "",
6
+ "admit_until_date (MM/DD/YY)": "",
7
+ "last_name": "",
8
+ "first_name": "",
9
+ "date_of_birth (MM/DD/YY)": "",
10
+ "passport_number": "",
11
+ "country_of_issue": "",
12
+ "gender (Fullform)": ""
13
+ }
14
+ }
templates/marriage_certificate.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "marriage_certificate":
3
+ {
4
+ "certificate_number": "",
5
+ "registration_number": "",
6
+ "issue_date (MM/DD/YY)": "",
7
+ "issuing_authority": "",
8
+ "issuing_jurisdiction": {
9
+ "city": "",
10
+ "county": "",
11
+ "state": "",
12
+ "country": ""
13
+ },
14
+ "marriage_date (MM/DD/YY)": "",
15
+ "marriage_location": {
16
+ "venue_name": "",
17
+ "city": "",
18
+ "county": "",
19
+ "state": "",
20
+ "country": ""
21
+ },
22
+ "Bride": {
23
+ "first_name": "",
24
+ "last_name": "",
25
+ "date_of_birth (MM/DD/YY)": "",
26
+ "place_of_birth": {
27
+ "city": "",
28
+ "state": "",
29
+ "country": ""
30
+ },
31
+ "residence": {
32
+ "city": "",
33
+ "state": "",
34
+ "country": ""
35
+ },
36
+ "father_name": "",
37
+ "mother_name": ""
38
+ },
39
+ "spouse_2": {
40
+ "first_name": "",
41
+ "last_name": "",
42
+ "date_of_birth (MM/DD/YY)": "",
43
+ "place_of_birth": {
44
+ "city": "",
45
+ "state": "",
46
+ "country": ""
47
+ },
48
+ "residence": {
49
+ "city": "",
50
+ "state": "",
51
+ "country": ""
52
+ },
53
+ "father_name": "",
54
+ "mother_name": ""
55
+ },
56
+ "officiant_info": {
57
+ "name": "",
58
+ "title": "",
59
+ "organization": ""
60
+ },
61
+ "witness_1": {
62
+ "name": "",
63
+ "address": ""
64
+ },
65
+ "witness_2": {
66
+ "name": "",
67
+ "address": ""
68
+ }
69
+ }
70
+ }
templates/passport.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "passport": {
3
+ "number": "",
4
+ "type": "",
5
+ "country_code" : "",
6
+ "surname": "",
7
+ "given_name": "",
8
+ "nationality": "",
9
+ "citizenship_country_name": "",
10
+ "date_of_birth (MM/DD/YY)": "",
11
+ "citizenship_number": "",
12
+ "sex (Fullform)": "",
13
+ "province_of_birth": "",
14
+ "country_of_birth": "",
15
+ "issue_date (MM/DD/YY)": "",
16
+ "expiration_date (MM/DD/YY)": "",
17
+ "issuing_country_name": "",
18
+ "fathers_name": ""
19
+ }
20
+ }
templates/proof_of_in_country_status.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"I-20":
2
+ {
3
+ "sevis_id": "",
4
+ "student_info": {
5
+ "surname_primary_name": "",
6
+ "given_name": "",
7
+ "preferred_name": "",
8
+ "passport_name": "",
9
+ "country_of_birth": "",
10
+ "country_of_citizenship": "",
11
+ "city_of_birth": "",
12
+ "date_of_birth (MM/DD/YY)": "",
13
+ "class_of_admission": ""
14
+ },
15
+ "school_information": {
16
+ "school_name": "",
17
+ "school_address": "",
18
+ "school_official_to_contact_upon_arrival": "",
19
+ "school_code_and_approval_date (MM/DD/YY)": "",
20
+ "admission_number": ""
21
+ },
22
+ "program_of_study": {
23
+ "education_level": "",
24
+ "major_1": "",
25
+ "major_2": "",
26
+ "program_english_proficiency": "",
27
+ "english_proficiency_notes": "",
28
+ "earliest_admission_date (MM/DD/YY)": "",
29
+ "start_of_classes (MM/DD/YY)": "",
30
+ "program_start_end_date (MM/DD/YY)": "",
31
+ "form_issue_reason": ""
32
+ },
33
+ "financials": {
34
+ "estimated_average_costs_9_months": {
35
+ "tuition_and_fees": "",
36
+ "living_expenses": "",
37
+ "expenses_of_dependents": "",
38
+ "books_health_insurance": "",
39
+ "total": ""
40
+ },
41
+ "students_funding_9_months": {
42
+ "personal_funds": "",
43
+ "funds_from_this_school": "",
44
+ "abroad_family_member": "",
45
+ "on_campus_employment": "",
46
+ "total": ""
47
+ }
48
+ },
49
+ "school_attestation": {
50
+ "signature_of": "",
51
+ "date_issued (MM/DD/YY)": "",
52
+ "place_issued": ""
53
+ },
54
+ "student_attestation": {
55
+ "signature_of": "",
56
+ "date (MM/DD/YY)": ""
57
+ },
58
+ "employment_authorizations": {
59
+ "current_session_start_date (MM/DD/YY)": "",
60
+ "current_session_end_date (MM/DD/YY)": ""
61
+ },
62
+ "travel_endorsement": {
63
+ "designated_school_official": "",
64
+ "title": "",
65
+ "date_issued (MM/DD/YY)": "",
66
+ "place_issued": ""
67
+ }
68
+ }
69
+ }
templates/resume.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "resume": {
3
+ "personal_information": {
4
+ "full_name": "",
5
+ "first_name": "",
6
+ "last_name": "",
7
+ "email": "",
8
+ "phone_number": "",
9
+ "linkedin_url": "",
10
+ "github_url": "",
11
+ "website_url": "",
12
+ "address": {
13
+ "street": "",
14
+ "apt_ste_flr": "",
15
+ "city": "",
16
+ "state": "",
17
+ "zip_code": "",
18
+ "country": ""
19
+ }
20
+ },
21
+ "professional_title": "",
22
+ "summary": "",
23
+ "work_experience": [
24
+ {
25
+ "job_title": "",
26
+ "employer_name": "",
27
+ "employment_type": "",
28
+ "location_city": "",
29
+ "location_state": "",
30
+ "location_country": "",
31
+ "start_date (MM/DD/YY)": "",
32
+ "end_date (MM/DD/YY)": "",
33
+ "is_current_role": "",
34
+ "description": "",
35
+ "achievements": [
36
+ ""
37
+ ],
38
+ "technologies_used": [
39
+ ""
40
+ ]
41
+ }
42
+ ],
43
+ "education": [
44
+ {
45
+ "institution_name": "",
46
+ "degree": "",
47
+ "field_of_study": "",
48
+ "start_date (MM/DD/YY)": "",
49
+ "end_date (MM/DD/YY)": "",
50
+ "is_current_program": "",
51
+ "gpa": "",
52
+ "location_city": "",
53
+ "location_state": "",
54
+ "location_country": ""
55
+ }
56
+ ],
57
+ "skills": {
58
+ "technical_skills": [
59
+ ""
60
+ ],
61
+ "soft_skills": [
62
+ ""
63
+ ],
64
+ "tools_and_technologies": [
65
+ ""
66
+ ],
67
+ "languages": [
68
+ {
69
+ "language": "",
70
+ "proficiency": ""
71
+ }
72
+ ]
73
+ },
74
+ "certifications": [
75
+ {
76
+ "name": "",
77
+ "issuer": "",
78
+ "issue_date (MM/DD/YY)": "",
79
+ "expiration_date (MM/DD/YY)": "",
80
+ "credential_id": "",
81
+ "credential_url": ""
82
+ }
83
+ ],
84
+ "projects": [
85
+ {
86
+ "name": "",
87
+ "role": "",
88
+ "start_date (MM/DD/YY)": "",
89
+ "end_date (MM/DD/YY)": "",
90
+ "is_current_project": "",
91
+ "description": "",
92
+ "responsibilities": [
93
+ ""
94
+ ],
95
+ "technologies_used": [
96
+ ""
97
+ ],
98
+ "project_url": ""
99
+ }
100
+ ],
101
+ "publications": [
102
+ {
103
+ "title": "",
104
+ "venue": "",
105
+ "publication_date (MM/DD/YY)": "",
106
+ "authors": [
107
+ ""
108
+ ],
109
+ "doi": "",
110
+ "url": ""
111
+ }
112
+ ],
113
+ "awards": [
114
+ {
115
+ "name": "",
116
+ "issuer": "",
117
+ "date (MM/DD/YY)": "",
118
+ "description": ""
119
+ }
120
+ ],
121
+ "professional_memberships": [
122
+ {
123
+ "organization": "",
124
+ "role": "",
125
+ "start_date (MM/DD/YY)": "",
126
+ "end_date (MM/DD/YY)": ""
127
+ }
128
+ ],
129
+ "additional_information": {
130
+ "work_authorization": "",
131
+ "security_clearance": "",
132
+ "willing_to_relocate": "",
133
+ "willing_to_travel": "",
134
+ "other_notes": ""
135
+ }
136
+ }
137
+ }
templates/school_transcripts.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "school_transcripts": {
3
+ "institution_information": {
4
+ "institution_name": "",
5
+ "school_or_college_name": "",
6
+ "campus": "",
7
+ "address": {
8
+ "street": "",
9
+ "apt_ste_flr": "",
10
+ "city": "",
11
+ "state": "",
12
+ "zip_code": "",
13
+ "country": ""
14
+ },
15
+ "telephone_number": "",
16
+ "email": "",
17
+ "website": ""
18
+ },
19
+ "student_information": {
20
+ "full_name": "",
21
+ "first_name": "",
22
+ "last_name": "",
23
+ "date_of_birth (MM/DD/YY)": "",
24
+ "student_id": "",
25
+ "program_name": "",
26
+ "degree_type": "",
27
+ "major": "",
28
+ "minor": "",
29
+ "admission_date (MM/DD/YY)": "",
30
+ "graduation_date (MM/DD/YY)": "",
31
+ "is_current_student": ""
32
+ },
33
+ "transcript_details": {
34
+ "issue_date (MM/DD/YY)": "",
35
+ "has_signature": "",
36
+ "has_official_seal": "",
37
+ "is_official_transcript": "",
38
+ "grading_scale": "",
39
+ "overall_gpa": "",
40
+ "credits_attempted": "",
41
+ "credits_earned": ""
42
+ },
43
+ "remarks": {
44
+ "academic_standing": "",
45
+ "honors": "",
46
+ "warnings_or_probations": "",
47
+ "other_notes": ""
48
+ },
49
+ "signatory": {
50
+ "name": "",
51
+ "title": "",
52
+ "signature_date (MM/DD/YY)": ""
53
+ }
54
+ }
55
+ }
templates/us_visa.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "us_visa":
3
+ {
4
+ "number": "",
5
+ "control_number": "",
6
+ "type": "",
7
+ "class": "",
8
+ "entries": "",
9
+ "issue_date (MM/DD/YY)": "",
10
+ "expiration_date (MM/DD/YY)": "",
11
+ "issuing_post": "",
12
+ "applicant_info": {
13
+ "surname": "",
14
+ "given_names": "",
15
+ "date_of_birth (MM/DD/YY)": "",
16
+ "nationality_country_name": "",
17
+ "sex (Fullform)": "",
18
+ "passport_number": ""
19
+ }
20
+ }
21
+ }