github-actions[bot] committed on
Commit
8e52fc5
·
0 Parent(s):

Sync from GitHub

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.github/workflows/sync-hf.yml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync Hugging Face Space
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ sync:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - name: Checkout
13
+ uses: actions/checkout@v4
14
+ with:
15
+ fetch-depth: 0
16
+ - name: Push to Hugging Face Space
17
+ env:
18
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
19
+ HF_SPACE: pradyten/pdf-extractor
20
+ run: |
21
+ set -euo pipefail
22
+ if [ -z "${HF_TOKEN}" ]; then
23
+ echo "HF_TOKEN is not set."
24
+ exit 1
25
+ fi
26
+ sync_dir="$(mktemp -d)"
27
+ git ls-files -z | tar --null -T - -cf - | tar -xf - -C "${sync_dir}"
28
+ find "${sync_dir}" -type f -name "*.pdf" -delete
29
+ rm -rf "${sync_dir}/sample"
30
+ cd "${sync_dir}"
31
+ git init
32
+ git config user.name "github-actions[bot]"
33
+ git config user.email "github-actions[bot]@users.noreply.github.com"
34
+ git add .
35
+ git commit -m "Sync from GitHub"
36
+ git remote add hf "https://user:${HF_TOKEN}@huggingface.co/spaces/${HF_SPACE}"
37
+ git push --force hf HEAD:main
.gitignore ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
208
+
209
+ *.pdf
210
+
211
+ !sample/*.pdf
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [server]
2
+ enableCORS = false
3
+ enableXsrfProtection = false
AGENTS.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Repository Guidelines
2
+
3
+ ## Project Structure & Module Organization
4
+ - `extractor.py` contains PDF rendering, template selection, and OpenAI calls.
5
+ - `templates/` holds JSON extraction templates referenced by `TEMPLATE_REGISTRY`.
6
+ - `src/streamlit_app.py` is the Hugging Face Space UI entrypoint.
7
+ - `Dockerfile` builds the Space image (Streamlit on port 8501).
8
+ - `.streamlit/config.toml` contains Space-friendly Streamlit server settings.
9
+ - `README.md` includes Space metadata front matter and usage notes.
10
+ - The UI relies on filename keywords to select templates (see `TEMPLATE_REGISTRY`).
11
+ - Sample PDFs are fetched from the HF dataset set by `SAMPLE_DATASET_REPO`.
12
+
13
+ ## Build, Test, and Development Commands
14
+ - Install dependencies with `python -m pip install -r requirements.txt`.
15
+ - Local CLI extraction prompts for a PDF path and prints JSON:
16
+ - `python extractor.py`
17
+ - Run the Space UI locally:
18
+ - `streamlit run src/streamlit_app.py`
19
+ - Quick import sanity check:
20
+ - `python -c "import extractor; print(extractor.DEFAULT_MODEL)"`
21
+
22
+ ## Coding Style & Naming Conventions
23
+ - Keep 2-space indentation in `extractor.py`.
24
+ - Use snake_case for functions/variables, UPPER_SNAKE for constants, and add type hints for new functions.
25
+ - Template JSON filenames should be snake_case and registered via lowercase filename keywords in `TEMPLATE_REGISTRY`.
26
+
27
+ ## Testing Guidelines
28
+ - No automated test suite exists yet. If adding tests, use `pytest` under `tests/`.
29
+ - Validate that model output matches the exact template schema and that filename keywords map to the right template.
30
+
31
+ ## Commit & Pull Request Guidelines
32
+ - No established commit convention; use short, imperative subjects.
33
+ - PRs should include the document type, template files touched, example filename keyword, and any config/env changes.
34
+
35
+ ## Security & Configuration Tips
36
+ - Set `OPENAI_API_KEY` for local runs and the Space; optionally override `EXTRACTOR_MODEL_ALIAS`.
37
+ - Avoid committing sensitive PDFs or output data; use redacted samples for demos.
38
+
39
+ ## Automation
40
+ - `.github/workflows/sync-hf.yml` pushes `main` to the HF Space on each commit using `HF_TOKEN`.
41
+ - Treat GitHub as the source of truth; direct edits on HF may be overwritten.
42
+ - The workflow force-pushes a fresh snapshot to avoid blocked legacy binaries in history.
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.13.5-slim

WORKDIR /app

# Build tools for wheels without sdists, curl for the healthcheck, git for
# huggingface_hub operations. --no-install-recommends keeps the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies BEFORE copying source so code edits do not
# invalidate the (slow) pip layer of the build cache.
COPY requirements.txt ./
RUN pip3 install --no-cache-dir -r requirements.txt

# Application code and assets.
COPY src/ ./src/
COPY .streamlit/ ./.streamlit/
COPY extractor.py ./
COPY templates/ ./templates/

EXPOSE 8501

HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Pdf Extractor
3
+ emoji: 🚀
4
+ colorFrom: red
5
+ colorTo: red
6
+ sdk: docker
7
+ app_port: 8501
8
+ tags:
9
+ - streamlit
10
+ pinned: false
11
+ short_description: pdf_extractor
12
+ ---
13
+
14
+ # PDF-to-JSON Extractor with AI
15
+
16
+ Intelligent PDF document parser that extracts structured JSON data using OpenAI's GPT models and computer vision.
17
+
18
+ ## 📋 Table of Contents
19
+ - [Overview](#overview)
20
+ - [Features](#features)
21
+ - [Technology Stack](#technology-stack)
22
+ - [Installation](#installation)
23
+ - [Usage](#usage)
24
+ - [Configuration](#configuration)
25
+ - [Author](#author)
26
+
27
+ ## 🎯 Overview
28
+
29
+ This application converts PDF documents into structured JSON format using:
30
+ - **OpenAI GPT-4 Vision**: For intelligent content extraction
31
+ - **Template-based extraction**: Customizable JSON schemas for different document types
32
+ - **Streamlit UI**: Interactive web interface for easy PDF processing
33
+ - **Docker support**: Containerized deployment for production environments
34
+
35
+ Perfect for automating data extraction from resumes, invoices, forms, and other structured documents.
36
+
37
+ ## ✨ Features
38
+
39
+ - **AI-Powered Extraction**: Uses GPT-4 Vision to understand document structure
40
+ - **Template System**: Pre-configured JSON templates for common document types
41
+ - **Batch Processing**: Handle multiple PDFs efficiently
42
+ - **Image Preview**: Visual confirmation of PDF pages before extraction
43
+ - **Format Validation**: Ensures extracted JSON matches defined schema
44
+ - **Hugging Face Spaces**: Ready for cloud deployment
45
+
46
+ ## 🛠 Technology Stack
47
+
48
+ - **Python 3.9+** - Primary programming language
49
+ - **OpenAI API** - GPT-4 Vision for intelligent extraction
50
+ - **pypdfium2** - PDF rendering and image conversion
51
+ - **Streamlit** - Interactive web UI framework
52
+ - **Pillow (PIL)** - Image processing
53
+ - **Pandas** - Data manipulation
54
+
55
+ ## 🚀 Installation
56
+
57
+ ### Prerequisites
58
+ - Python 3.9 or higher
59
+ - OpenAI API key ([Get one here](https://platform.openai.com/api-keys))
60
+
61
+ ### Setup
62
+
63
+ 1. Clone the repository:
64
+ \`\`\`bash
65
+ git clone https://github.com/pradyten/pdf-extractor.git
66
+ cd pdf-extractor
67
+ \`\`\`
68
+
69
+ 2. Install dependencies:
70
+ \`\`\`bash
71
+ pip install -r requirements.txt
72
+ \`\`\`
73
+
74
+ 3. Configure OpenAI API key:
75
+ \`\`\`bash
76
+ export OPENAI_API_KEY='your-api-key-here'
77
+ \`\`\`
78
+
79
+ ## 💻 Usage
80
+
81
+ ### Command Line
82
+ \`\`\`bash
83
+ python extractor.py path/to/document.pdf
84
+ \`\`\`
85
+
86
+ ### Streamlit Web UI
87
+ \`\`\`bash
88
+ streamlit run src/streamlit_app.py
89
+ \`\`\`
90
+
91
+ ### Docker
92
+ \`\`\`bash
93
+ docker build -t pdf-extractor .
94
+ docker run -p 8501:8501 -e OPENAI_API_KEY='your-key' pdf-extractor
95
+ \`\`\`
96
+
97
+ ## ⚙️ Configuration
98
+
99
+ Add custom template JSON files under \`templates/\` and register their filename keywords in \`TEMPLATE_REGISTRY\` inside \`extractor.py\` for different document types (resumes, invoices, forms).
100
+
101
+ ## 🎓 Use Cases
102
+
103
+ - **HR & Recruitment**: Batch process resume PDFs
104
+ - **Accounting**: Extract invoice data
105
+ - **Data Entry**: Automate form digitization
106
+ - **Document Management**: Convert scanned documents to searchable JSON
107
+
108
+ ## 🔒 Security & Privacy
109
+
110
+ - Never commit API keys - use environment variables
111
+ - PDFs are processed in-memory, not stored
112
+ - Review OpenAI's data usage policies for compliance
113
+
114
+ ## 👨‍💻 Author
115
+
116
+ **Pradyumn Tendulkar**
117
+
118
+ Data Science Graduate Student | ML Engineer
119
+
120
+ - GitHub: [@pradyten](https://github.com/pradyten)
121
+ - LinkedIn: [Pradyumn Tendulkar](https://www.linkedin.com/in/p-tendulkar/)
122
+ - Email: pktendulkar@wpi.edu
123
+
124
+ ---
125
+
126
+ ⭐ If you found this project helpful, please consider giving it a star!
127
+
128
+ 📝 **License:** MIT
extractor.py ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import base64
import io
import json
import os
import sys
from typing import Any, Dict, List, Optional, Tuple

import pypdfium2 as pdfium
from openai import OpenAI
9
+
10
+
11
+ # path to templates folder (relative to this file)
12
+ TEMPLATES_DIR = os.path.join(os.path.dirname(__file__), "templates")
13
+
14
+
15
# keyword in PDF filename (lowercase) -> (document_type, template_file).
# Several keywords deliberately share one template (e.g. the many spellings
# of "employment letter"), so each pair is written once per keyword.
_KEYWORD_SPECS: Dict[str, Tuple[str, str]] = {
    # Immigration forms
    "i129": ("USCIS Form I-129 H-1B Petition", "i129_h1b_petition.json"),
    "i94": ("Form I-94 Arrival/Departure Record", "i_94.json"),
    "i-94": ("Form I-94 Arrival/Departure Record", "i_94.json"),
    "i20": ("Form I-20 Certificate of Eligibility", "proof_of_in_country_status.json"),
    "i-20": ("Form I-20 Certificate of Eligibility", "proof_of_in_country_status.json"),
    # Identity documents
    "passport": ("Passport", "passport.json"),
    "visa": ("US Visa", "us_visa.json"),
    # Education documents
    "transcript": ("Academic Transcript", "school_transcripts.json"),
    "diploma": ("Diploma", "diplomas.json"),
    # Employment documents
    "employment letter": ("Employment Letter", "employment_letter.json"),
    "offer letter": ("Employment Letter", "employment_letter.json"),
    "offer-letter": ("Employment Letter", "employment_letter.json"),
    "offer_letter": ("Employment Letter", "employment_letter.json"),
    "employment_letter": ("Employment Letter", "employment_letter.json"),
    "employment": ("Employment Letter", "employment_letter.json"),
    "resume": ("Resume/CV", "resume.json"),
    "cv": ("Resume/CV", "resume.json"),
    # Tax and corporate documents
    "fein": ("Corporate Tax Returns", "corporate_tax_returns.json"),
    "cp575": ("Corporate Tax Returns", "corporate_tax_returns.json"),
    "tax": ("Corporate Tax Returns", "corporate_tax_returns.json"),
    # Personal documents
    "marriage": ("Marriage Certificate", "marriage_certificate.json"),
    "marriage_certificate": ("Marriage Certificate", "marriage_certificate.json"),
    # Proof of status
    "proof": ("Proof of In-Country Status", "proof_of_in_country_status.json"),
}

# Public registry consumed by infer_template_from_filename() and the UI:
# keyword -> {"document_type": ..., "template_file": ...}.
TEMPLATE_REGISTRY: Dict[str, Dict[str, str]] = {
    keyword: {"document_type": doc_type, "template_file": template_file}
    for keyword, (doc_type, template_file) in _KEYWORD_SPECS.items()
}
124
+
125
+
126
# Logical model aliases for this extractor (OpenAI ChatGPT models).
# "default" is resolved to DEFAULT_MODEL inside call_openai_extract().
_CURRENT_MODEL_ALIASES = [
    "default",
    "gpt-4.1-mini",
    "gpt-4.1",
    "gpt-4o-mini",
    "gpt-4o",
]
# Legacy/dated aliases kept for compatibility.
_LEGACY_MODEL_ALIASES = [
    "gpt-4.1-2025-04-14",
    "gpt-4.1-mini-2025-04-14",
    "gpt-5-2025-08-07",
    "gpt-5-mini-2025-08-07",
]
ALLOWED_MODELS = _CURRENT_MODEL_ALIASES + _LEGACY_MODEL_ALIASES

# Model used when a caller passes "default"; overridable via environment.
DEFAULT_MODEL = os.getenv("EXTRACTOR_MODEL_ALIAS", "gpt-4.1-mini")

# Name of the environment variable that must hold the OpenAI API key.
OPENAI_API_KEY_ENV = "OPENAI_API_KEY"
143
+ _openai_client: Optional[OpenAI] = None
144
+
145
+
146
def load_template(template_file: str) -> Dict[str, Any]:
    """Read a JSON extraction template from TEMPLATES_DIR and return it.

    Raises FileNotFoundError when the template file does not exist.
    """
    template_path = os.path.join(TEMPLATES_DIR, template_file)
    if not os.path.exists(template_path):
        raise FileNotFoundError(f"Template not found: {template_path}")
    with open(template_path, "r", encoding="utf-8") as handle:
        return json.load(handle)
152
+
153
+
154
def infer_template_from_filename(filename: str) -> Tuple[str, Dict[str, Any]]:
    """Pick (document_type, template) for a PDF based on its file name.

    The lowercased basename is scanned for the first matching keyword in
    TEMPLATE_REGISTRY (dict insertion order), and the associated template
    JSON is loaded.

    Example:
      - 'I129 HALF.pdf'      -> matches 'i129'     -> i129_h1b_petition.json
      - 'passport_rohan.pdf' -> matches 'passport' -> passport.json
      - 'F1_visa_page1.pdf'  -> matches 'visa'     -> us_visa.json
      - 'i94_record.pdf'     -> matches 'i94'      -> i_94.json

    Raises ValueError when no registered keyword appears in the name.
    """
    basename = os.path.basename(filename).lower()

    matched = next(
        (cfg for keyword, cfg in TEMPLATE_REGISTRY.items() if keyword in basename),
        None,
    )
    if matched is not None:
        return matched["document_type"], load_template(matched["template_file"])

    # fallback: raise to force user to add mapping or rename file
    raise ValueError(
        f"Could not infer document type from filename '{basename}'. "
        f"Known keywords: {list(TEMPLATE_REGISTRY.keys())}"
    )
177
+
178
+
179
def pdf_bytes_to_base64_images(pdf_bytes: bytes, max_pages: int = 10) -> List[str]:
    """Render PDF pages to JPEG and return base64 strings (no data-URL prefix).

    At most ``max_pages`` pages are rendered (pass 0 or None to render all).
    Render scale and JPEG quality are lowered for longer documents so the
    total payload size stays manageable.
    """
    document = pdfium.PdfDocument(pdf_bytes)
    encoded_pages: List[str] = []

    try:
        total_pages = len(document)
        if max_pages is not None and max_pages > 0:
            limit = min(total_pages, max_pages)
        else:
            limit = total_pages

        # Adaptive scale/quality to keep payloads manageable.
        if limit <= 2:
            scale, quality = 4.17, 80  # ~300 DPI
        elif limit <= 10:
            scale, quality = 2.0, 60   # ~145 DPI
        else:
            scale, quality = 1.5, 60   # ~110 DPI

        for page_index in range(limit):
            rendered = document[page_index].render(scale=scale).to_pil()
            buffer = io.BytesIO()
            rendered.save(buffer, format="JPEG", quality=quality)
            encoded_pages.append(base64.b64encode(buffer.getvalue()).decode("utf-8"))
            buffer.close()
            rendered.close()
    finally:
        # Always release the pdfium document handle, even on render errors.
        document.close()

    return encoded_pages
220
+
221
+
222
def build_extraction_prompt(document_type: str, template: Dict[str, Any]) -> str:
    """Compose the prompt that pins the model to the template's JSON schema.

    The template is embedded verbatim (pretty-printed) so the model echoes
    the exact field structure back, with "" for any missing field.
    """
    template_json = json.dumps(template, indent=2)
    return f"""
You are a document data extraction system.

Document Type: {document_type}

Extract all information from the provided document image(s) and return it in the following exact JSON structure:

{template_json}

Instructions:
- Output only valid JSON matching exactly the structure above
- Do NOT add explanations
- Do NOT wrap the JSON in markdown, backticks, or code fences
- If a field is missing, set it to ""
- Use the exact field names; do not modify the structure
- Extract information from ALL pages
"""
244
+
245
+
246
def _get_openai_client() -> OpenAI:
    """Return a lazily-created, module-cached OpenAI client.

    Raises RuntimeError when the API-key environment variable is unset.
    """
    global _openai_client
    if _openai_client is not None:
        return _openai_client

    api_key = os.getenv(OPENAI_API_KEY_ENV)
    if not api_key:
        raise RuntimeError(
            f"{OPENAI_API_KEY_ENV} is not set. "
            "Set it in your environment or CI secrets."
        )
    _openai_client = OpenAI(api_key=api_key)
    return _openai_client
257
+
258
+
259
+ def _extract_text_from_response(response: Any) -> str:
260
+ output_text = getattr(response, "output_text", None)
261
+ if isinstance(output_text, str) and output_text.strip():
262
+ return output_text.strip()
263
+
264
+ output = getattr(response, "output", None)
265
+ if isinstance(output, list):
266
+ parts: List[str] = []
267
+ for item in output:
268
+ content = getattr(item, "content", None)
269
+ if content is None and isinstance(item, dict):
270
+ content = item.get("content")
271
+ if isinstance(content, list):
272
+ for block in content:
273
+ if isinstance(block, dict):
274
+ block_type = block.get("type")
275
+ if block_type in ("output_text", "text"):
276
+ parts.append(block.get("text", ""))
277
+ else:
278
+ block_type = getattr(block, "type", None)
279
+ if block_type in ("output_text", "text"):
280
+ parts.append(getattr(block, "text", ""))
281
+ return "".join(parts).strip()
282
+
283
+ return ""
284
+
285
+
286
def _invoke_openai(prompt: str, images: List[str], model: str) -> Any:
    """Send the prompt plus page images to the OpenAI Responses API.

    Each image is attached as a base64 JPEG data URL. Temperature is pinned
    to 0 for deterministic extraction. Returns the raw API response object.
    """
    client = _get_openai_client()

    user_content: List[Dict[str, Any]] = [{"type": "input_text", "text": prompt}]
    user_content.extend(
        {
            "type": "input_image",
            "image_url": f"data:image/jpeg;base64,{img_b64}",
        }
        for img_b64 in images
    )

    system_message = {
        "role": "system",
        "content": [
            {
                "type": "input_text",
                "text": "You are a precise document extraction engine.",
            }
        ],
    }

    return client.responses.create(
        model=model,
        temperature=0,
        input=[
            system_message,
            {"role": "user", "content": user_content},
        ],
    )
323
+
324
+
325
def call_openai_extract(
    document_type: str,
    template: Dict[str, Any],
    images: List[str],
    model: str = DEFAULT_MODEL,
) -> Dict[str, Any]:
    """Run the extraction prompt through OpenAI and parse the JSON reply.

    "default" is resolved to DEFAULT_MODEL; any other alias must appear in
    ALLOWED_MODELS. Markdown code fences around the reply are tolerated and
    stripped before parsing.

    Raises ValueError for an unsupported model alias, an empty reply, or a
    reply that is not valid JSON.
    """
    resolved_model = DEFAULT_MODEL if model == "default" else model
    if resolved_model not in ALLOWED_MODELS:
        raise ValueError(
            f"Unsupported model alias '{model}'. "
            f"Supported values: {ALLOWED_MODELS}. "
            "This extractor uses OpenAI ChatGPT models."
        )

    response = _invoke_openai(
        build_extraction_prompt(document_type, template), images, resolved_model
    )
    json_str = _extract_text_from_response(response).strip()

    # Strip optional markdown fences (```json ... ```)
    if json_str.startswith("```"):
        fence_lines = json_str.splitlines()
        if fence_lines and fence_lines[0].lstrip().startswith("```"):
            fence_lines = fence_lines[1:]
        if fence_lines and fence_lines[-1].strip().startswith("```"):
            fence_lines = fence_lines[:-1]
        json_str = "\n".join(fence_lines).strip()

    if not json_str:
        raise ValueError(
            "Model response did not contain any text content to parse as JSON."
        )

    try:
        return json.loads(json_str)
    except json.JSONDecodeError as exc:
        snippet = json_str[:500]
        raise ValueError(
            f"Model output was not valid JSON: {exc}. "
            f"First 500 characters of response: {snippet!r}"
        ) from exc
371
+
372
+
373
def extract_using_openai_from_pdf_bytes(
    pdf_bytes: bytes,
    filename: str,
    max_pages: int = 10,
    model: str = DEFAULT_MODEL,
) -> Dict[str, Any]:
    """Backwards-compatible entrypoint used by the Vision Lambda.

    Despite the legacy name, this now uses OpenAI ChatGPT to perform the
    extraction while preserving the JSON contract: the template is picked
    from keywords in ``filename``, pages are rendered to base64 images, and
    the model's JSON reply is returned as a dict.

    Raises RuntimeError when no page images could be rendered.
    """
    document_type, template = infer_template_from_filename(filename)

    page_images = pdf_bytes_to_base64_images(pdf_bytes, max_pages=max_pages)
    if not page_images:
        raise RuntimeError("No images were extracted from PDF")

    return call_openai_extract(document_type, template, page_images, model=model)
391
+
392
+
393
+ def _prompt_for_pdf_path() -> str:
394
+ """
395
+ Simple CLI helper for local runs. Web UI integrations can call
396
+ extract_using_openai_from_pdf_bytes directly instead.
397
+ """
398
+ path = input("Enter path to PDF: ").strip()
399
+ if not path:
400
+ raise SystemExit("No PDF path provided.")
401
+ return path
402
+
403
+
404
+ if __name__ == "__main__":
405
+ pdf_path = _prompt_for_pdf_path()
406
+ with open(pdf_path, "rb") as fh:
407
+ pdf_data = fh.read()
408
+ result = extract_using_openai_from_pdf_bytes(pdf_data, pdf_path)
409
+ print(json.dumps(result, ensure_ascii=False))
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ altair
2
+ huggingface_hub
3
+ openai
4
+ pandas
5
+ pillow
6
+ pypdfium2
7
+ streamlit==1.29.0
src/streamlit_app.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import json
3
+ import os
4
+ import sys
5
+
6
+ import streamlit as st
7
+ import pypdfium2 as pdfium
8
+ from huggingface_hub import HfApi, hf_hub_download
9
+
10
+ ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
11
+ if ROOT_DIR not in sys.path:
12
+ sys.path.insert(0, ROOT_DIR)
13
+
14
+ from extractor import extract_using_openai_from_pdf_bytes, TEMPLATE_REGISTRY
15
+
16
# Hub dataset that hosts the bundled sample PDFs; override via env var.
SAMPLE_DATASET_REPO = os.getenv("SAMPLE_DATASET_REPO", "pradyten/pdf-extractor-samples")
20
+
21
+
22
st.set_page_config(page_title="PDF Extractor", layout="wide")

# Small design system injected as raw CSS: fonts, palette variables, and
# rounded "card" styling for columns, buttons, and the file uploader.
_CUSTOM_CSS = """
    <style>
    @import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@500;700&family=Plus+Jakarta+Sans:wght@400;500;600&display=swap');
    :root {
        --bg-0: #f3ede4;
        --bg-1: #fbf5ea;
        --panel: #ffffff;
        --border: rgba(16, 24, 40, 0.12);
        --text: #121212;
        --muted: #5b616b;
        --accent: #d4552d;
        --accent-dark: #b44725;
        --shadow: 0 18px 50px rgba(20, 20, 20, 0.12);
    }
    html, body, [data-testid="stAppViewContainer"] {
        background: radial-gradient(1200px 600px at 10% -10%, var(--bg-0) 0%, #f7f2e9 45%, var(--bg-1) 100%);
        color: var(--text);
        font-family: "Plus Jakarta Sans", system-ui, -apple-system, "Segoe UI", sans-serif;
    }
    h1, h2, h3, h4, h5 {
        font-family: "Space Grotesk", system-ui, -apple-system, "Segoe UI", sans-serif;
        letter-spacing: -0.02em;
    }
    .main .block-container {
        max-width: 1200px;
        padding-top: 2.5rem;
        padding-bottom: 3rem;
    }
    div[data-testid="column"] > div {
        background: var(--panel);
        border: 1px solid var(--border);
        border-radius: 18px;
        padding: 1.25rem 1.5rem 1.5rem 1.5rem;
        box-shadow: var(--shadow);
    }
    .stButton > button {
        background: var(--accent);
        color: #ffffff;
        border: none;
        border-radius: 999px;
        padding: 0.65rem 1.4rem;
        font-weight: 600;
    }
    .stButton > button:hover {
        background: var(--accent-dark);
        color: #ffffff;
    }
    div[data-testid="stFileUploader"] {
        border: 1px dashed rgba(16, 24, 40, 0.18);
        border-radius: 14px;
        padding: 0.6rem;
        background: rgba(248, 244, 236, 0.6);
    }
    .stAlert {
        border-radius: 12px;
    }
    pre, code, .stCodeBlock {
        border-radius: 12px !important;
    }
    #MainMenu, footer {
        visibility: hidden;
    }
    </style>
    """
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
91
+
92
+
93
def _render_pdf_preview(pdf_bytes: bytes) -> None:
    """Render page 1 of the PDF as an image, or a friendly notice on failure."""
    document = None
    try:
        document = pdfium.PdfDocument(pdf_bytes)
        if len(document) < 1:
            st.info("No pages found in this PDF.")
            return
        first_page = document[0]
        rendered = first_page.render(scale=2.0).to_pil()
        st.image(rendered, caption="Preview (page 1)", use_column_width=True)
    except Exception as exc:  # pragma: no cover - UI preview path
        st.warning(f"Preview unavailable: {exc}")
    finally:
        # Release pdfium resources even when rendering fails midway.
        if document is not None:
            document.close()
108
+
109
+
110
+ def _load_pdf_state(uploaded_file) -> tuple[bytes, str, str]:
111
+ pdf_bytes = uploaded_file.getvalue()
112
+ digest = hashlib.sha256(pdf_bytes).hexdigest()
113
+ return pdf_bytes, uploaded_file.name, digest
114
+
115
+
116
@st.cache_data(show_spinner=False)
def _list_sample_pdfs(repo_id: str) -> list[str]:
    """Return the sorted *.pdf file names in a Hub dataset repo ([] on failure)."""
    api = HfApi()
    try:
        repo_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    except Exception:
        # Network/auth problems degrade to "no samples" rather than crashing.
        return []
    return sorted(f for f in repo_files if f.lower().endswith(".pdf"))
124
+
125
+
126
@st.cache_data(show_spinner=False)
def _load_sample_state(repo_id: str, filename: str) -> tuple[bytes, str, str]:
    """Download one sample PDF from the Hub; return (bytes, name, sha256-hex)."""
    local_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset")
    with open(local_path, "rb") as handle:
        data = handle.read()
    return data, filename, hashlib.sha256(data).hexdigest()
133
+
134
+
135
+ def _build_download_name(filename: str) -> str:
136
+ base = os.path.splitext(filename)[0] if filename else "extraction"
137
+ safe = "".join(ch if ch.isalnum() or ch in ("-", "_") else "_" for ch in base)
138
+ if not safe:
139
+ safe = "extraction"
140
+ return f"{safe}_extracted.json"
141
+
142
+
143
def _reset_pdf_state() -> None:
    """Clear the active PDF and any extraction output from the session."""
    for key in (
        "pdf_bytes",
        "pdf_filename",
        "pdf_digest",
        "extract_result",
        "extract_error",
        "extract_digest",
        "extract_filename",
    ):
        st.session_state[key] = None
151
+
152
+
153
def _supported_doc_types() -> list[str]:
    """Ordered, de-duplicated document_type values from the template registry."""
    # An insertion-ordered dict doubles as an ordered set here.
    ordered: dict[str, None] = {}
    for cfg in TEMPLATE_REGISTRY.values():
        doc_type = cfg.get("document_type")
        if doc_type:
            ordered.setdefault(doc_type, None)
    return list(ordered)
160
+
161
+
162
# Seed every session key the app touches so later reads never KeyError.
for _key in (
    "extract_result",
    "extract_error",
    "extract_digest",
    "extract_filename",
    "pdf_bytes",
    "pdf_filename",
    "pdf_digest",
    "input_mode_prev",
):
    if _key not in st.session_state:
        st.session_state[_key] = None
178
+
179
+
180
# Page header plus the two-column layout: input/preview on the left,
# extraction output on the right.
st.markdown("## PDF Extractor")
st.markdown(
    "Choose a sample or upload your own PDF, preview it, then click Extract "
    "to generate structured JSON on the right."
)

left, right = st.columns([1, 1], gap="large")
187
+
188
def _remember_pdf(data: bytes, name: str, digest: str) -> None:
    # Store a newly selected PDF and drop any stale extraction output.
    # A matching digest means the same document is already active.
    if st.session_state.pdf_digest != digest:
        st.session_state.pdf_bytes = data
        st.session_state.pdf_filename = name
        st.session_state.pdf_digest = digest
        st.session_state.extract_result = None
        st.session_state.extract_error = None
        st.session_state.extract_digest = digest
        st.session_state.extract_filename = name


with left:
    st.markdown("### Upload + Preview")
    source_mode = st.radio(
        "Input source",
        ["Upload PDF", "Use sample"],
        horizontal=True,
        label_visibility="collapsed",
        key="input_mode",
    )
    # Switching between upload/sample invalidates the current document.
    if st.session_state.input_mode_prev != source_mode:
        _reset_pdf_state()
        st.session_state.input_mode_prev = source_mode

    chosen_sample = None
    upload = None

    if source_mode == "Use sample":
        sample_files = _list_sample_pdfs(SAMPLE_DATASET_REPO)
        if not sample_files:
            st.info("No sample PDFs found in the sample dataset yet.")
            _reset_pdf_state()
        picker_value = st.selectbox(
            "Choose a sample",
            ["Choose a sample..."] + sample_files,
            label_visibility="collapsed",
            key="sample_choice",
        )
        # The placeholder entry is not a valid selection.
        chosen_sample = picker_value if picker_value in sample_files else None
        if chosen_sample is None:
            _reset_pdf_state()
    else:
        upload = st.file_uploader(
            "Upload a PDF",
            type=["pdf"],
            accept_multiple_files=False,
            label_visibility="collapsed",
            key="pdf_uploader",
            help="File name should include a known keyword (for example: resume, passport, i129).",
        )

    if source_mode == "Use sample" and chosen_sample:
        try:
            sample_bytes, sample_name, sample_digest = _load_sample_state(
                SAMPLE_DATASET_REPO,
                chosen_sample,
            )
        except Exception as exc:  # pragma: no cover - sample load path
            st.error(f"Sample load failed: {exc}")
        else:
            _remember_pdf(sample_bytes, sample_name, sample_digest)
            st.markdown(f"**Sample:** `{st.session_state.pdf_filename}`")
            _render_pdf_preview(st.session_state.pdf_bytes)
    elif source_mode == "Upload PDF" and upload is not None:
        upload_bytes, upload_name, upload_digest = _load_pdf_state(upload)
        _remember_pdf(upload_bytes, upload_name, upload_digest)
        st.markdown(f"**File:** `{st.session_state.pdf_filename}`")
        _render_pdf_preview(st.session_state.pdf_bytes)
    else:
        st.info("Upload a PDF or choose a sample to preview it here.")

    st.markdown("#### Notes")
    st.caption(
        "Template selection is inferred from the filename. If extraction fails, "
        "rename the file to include a supported keyword (for example: "
        "`resume.pdf`, `passport_jane.pdf`, `i129_petition.pdf`)."
    )
    st.caption(f"Sample dataset: `{SAMPLE_DATASET_REPO}`")
    st.markdown("#### Supported documents")
    st.markdown("\n".join(f"- {doc}" for doc in _supported_doc_types()))
274
+
275
with right:
    st.markdown("### Extract")
    model_choice = st.selectbox(
        "Model",
        ["default", "gpt-4.1-mini", "gpt-4.1", "gpt-4o-mini", "gpt-4o"],
        index=1,
        help="Choose a model or use default (EXTRACTOR_MODEL_ALIAS).",
    )
    # NOTE(review): the literal string "default" is forwarded verbatim as the
    # model name below — confirm the extractor maps it to EXTRACTOR_MODEL_ALIAS.

    has_api_key = bool(os.getenv("OPENAI_API_KEY"))
    if not has_api_key:
        st.warning("OPENAI_API_KEY is not set. Add it to your environment or Space secrets.")

    run_extraction = st.button(
        "Extract",
        use_container_width=False,
        disabled=st.session_state.pdf_bytes is None or not has_api_key,
    )

    if run_extraction:
        with st.spinner("Extracting structured JSON..."):
            try:
                st.session_state.extract_result = extract_using_openai_from_pdf_bytes(
                    st.session_state.pdf_bytes,
                    st.session_state.pdf_filename,
                    model=model_choice,
                )
                st.session_state.extract_error = None
            except Exception as exc:  # pragma: no cover - runtime error path
                message = str(exc)
                # Substitute a friendlier hint for the common 403 failure mode.
                if "403" in message or "PermissionDenied" in message:
                    message = (
                        "OpenAI request was rejected (403). "
                        "Check OPENAI_API_KEY, model access, and billing."
                    )
                st.session_state.extract_error = message
                st.session_state.extract_result = None

    if st.session_state.extract_error:
        st.error(st.session_state.extract_error)

    if st.session_state.extract_result is None:
        st.info("Extraction output will appear here.")
    else:
        st.markdown("#### JSON Output")
        pretty_json = json.dumps(
            st.session_state.extract_result,
            indent=2,
            ensure_ascii=False,
        )
        st.code(pretty_json, language="json")
        st.download_button(
            "Download JSON",
            data=pretty_json,
            file_name=_build_download_name(st.session_state.pdf_filename or ""),
            mime="application/json",
        )
templates/corporate_tax_returns.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "legal_business_name": "",
3
+ "trade_name_dba": "",
4
+ "contact_person": {
5
+ "first_name": "",
6
+ "last_name": "",
7
+ "title": ""
8
+ },
9
+ "business_address": {
10
+ "street": "",
11
+ "apt_ste_flr": "",
12
+ "city": "",
13
+ "state": "",
14
+ "zip_code": ""
15
+ },
16
+ "telephone_number": "",
17
+ "naics_code": "",
18
+ "type_of_business": "",
19
+ "federal_employer_identification_number": "",
20
+ "gross_annual_income": "",
21
+ "net_annual_income": "",
22
+ "company_signatories": [
23
+ {
24
+ "name": "",
25
+ "title": "",
26
+ "signature_date (MM/DD/YY)": ""
27
+ }
28
+ ],
29
+ "tax_year": "",
30
+ "form_type": ""
31
+ }
templates/diplomas.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "diploma": {
3
+ "institution_information": {
4
+ "institution_name": "",
5
+ "school_or_college_name": "",
6
+ "campus": "",
7
+ "address": {
8
+ "street": "",
9
+ "apt_ste_flr": "",
10
+ "city": "",
11
+ "state": "",
12
+ "zip_code": "",
13
+ "country": ""
14
+ },
15
+ "telephone_number": "",
16
+ "email": "",
17
+ "website": ""
18
+ },
19
+ "student_information": {
20
+ "full_name": "",
21
+ "first_name": "",
22
+ "last_name": "",
23
+ "date_of_birth (MM/DD/YY)": "",
24
+ "student_id": ""
25
+ },
26
+ "diploma_details": {
27
+ "has_signature": "",
28
+ "has_official_seal": "",
29
+ "issue_date (MM/DD/YY)": "",
30
+ "degree_type": "",
31
+ "major": "",
32
+ "minor": "",
33
+ "concentration": "",
34
+ "honors": "",
35
+ "program_length": ""
36
+ },
37
+ "signatories": [
38
+ {
39
+ "name": "",
40
+ "title": "",
41
+ "signature_date (MM/DD/YY)": ""
42
+ }
43
+ ],
44
+ "document_metadata": {
45
+ "diploma_number": "",
46
+ "serial_number": "",
47
+ "language": "",
48
+ "format": ""
49
+ }
50
+ }
51
+ }
templates/employment_letter.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "employment_letter": {
3
+ "letter_type": "",
4
+ "letter_date (MM/DD/YY)": "",
5
+ "employee_information": {
6
+ "full_name": "",
7
+ "first_name": "",
8
+ "last_name": "",
9
+ "date_of_birth (MM/DD/YY)": "",
10
+ "employee_id": "",
11
+ "job_title": "",
12
+ "department": ""
13
+ },
14
+ "employment_details": {
15
+ "employment_status": "",
16
+ "employment_start_date (MM/DD/YY)": "",
17
+ "employment_end_date (MM/DD/YY)": "",
18
+ "is_currently_employed": "",
19
+ "work_schedule": "",
20
+ "hours_per_week": "",
21
+ "full_time (yes/no)": "",
22
+ "salary": "",
23
+ "salary_frequency (year/month/etc.)": "",
24
+ "bonus_or_variable_pay": "",
25
+ "job_duties_summary": ""
26
+ },
27
+ "employer_information": {
28
+ "legal_business_name": "",
29
+ "trade_name_dba": "",
30
+ "business_address": {
31
+ "street": "",
32
+ "apt_ste_flr": "",
33
+ "city": "",
34
+ "state": "",
35
+ "zip_code": "",
36
+ "country": ""
37
+ },
38
+ "telephone_number": "",
39
+ "email": "",
40
+ "website": ""
41
+ },
42
+ "supervisor_or_hr_contact": {
43
+ "name": "",
44
+ "title": "",
45
+ "phone_number": "",
46
+ "email": ""
47
+ },
48
+ "work_location": [
49
+ {
50
+ "street": "",
51
+ "apt_ste_flr": "",
52
+ "city": "",
53
+ "state": "",
54
+ "zip_code": "",
55
+ "is_third_party_location": "",
56
+ "third_party_name": ""
57
+ }
58
+ ],
59
+ "has_signature": "",
60
+ "signatory": {
61
+ "name": "",
62
+ "title": "",
63
+ "signature_date (MM/DD/YY)": ""
64
+ }
65
+ }
66
+ }
templates/i129_h1b_petition.json ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "i129_h1b_petition": {
3
+ "has_signature": "",
4
+ "form_edition": "",
5
+ "petition_type": "H-1B",
6
+
7
+ "petitioner_information": {
8
+ "petitioner_is_individual": "",
9
+ "individual_petitioner": {
10
+ "family_name": "",
11
+ "given_name": "",
12
+ "middle_name": ""
13
+ },
14
+ "company_information": {
15
+ "company_name": "",
16
+ "fein": "",
17
+ "is_nonprofit_or_government_research_org": "",
18
+ "number_of_employees_in_us": "",
19
+ "year_established": "",
20
+ "type_of_business": "",
21
+ "gross_annual_income": "",
22
+ "net_annual_income": ""
23
+ },
24
+ "mailing_address": {
25
+ "in_care_of": "",
26
+ "street": "",
27
+ "apt_ste_flr": "",
28
+ "city": "",
29
+ "state": "",
30
+ "zip_code": "",
31
+ "province": "",
32
+ "postal_code": "",
33
+ "country": ""
34
+ },
35
+ "contact_information": {
36
+ "daytime_phone": "",
37
+ "mobile_phone": "",
38
+ "email": ""
39
+ }
40
+ },
41
+
42
+ "petition_information": {
43
+ "requested_classification_symbol": "",
44
+ "basis_for_classification": "",
45
+ "most_recent_receipt_number": "",
46
+ "requested_action": "",
47
+ "total_workers_in_petition": ""
48
+ },
49
+
50
+ "beneficiary_information": {
51
+ "type_of_beneficiary": "",
52
+ "group_name_if_entertainment": "",
53
+ "full_name": {
54
+ "family_name": "",
55
+ "given_name": "",
56
+ "middle_name": ""
57
+ },
58
+ "other_names_used": "",
59
+ "date_of_birth": "",
60
+ "sex": "",
61
+ "country_of_birth": "",
62
+ "province_of_birth": "",
63
+ "country_of_citizenship": "",
64
+ "alien_number": "",
65
+ "ssn": "",
66
+ "itin": "",
67
+ "passport": {
68
+ "number": "",
69
+ "country_of_issuance": "",
70
+ "date_issued": "",
71
+ "date_expires": ""
72
+ },
73
+ "i94_number": "",
74
+ "date_of_last_arrival": "",
75
+ "current_us_address": {
76
+ "street": "",
77
+ "apt_ste_flr": "",
78
+ "city": "",
79
+ "state": "",
80
+ "zip_code": ""
81
+ },
82
+ "current_nonimmigrant_status": "",
83
+ "status_expiration_date": "",
84
+ "sevis_number": "",
85
+ "ead_number": "",
86
+ "foreign_address": {
87
+ "street": "",
88
+ "apt_ste_flr": "",
89
+ "city": "",
90
+ "state_province": "",
91
+ "postal_code": "",
92
+ "country": ""
93
+ }
94
+ },
95
+
96
+ "processing_information": {
97
+ "consulate_or_poe_to_notify": {
98
+ "type": "",
99
+ "city": "",
100
+ "state_or_country": ""
101
+ },
102
+ "beneficiary_has_valid_passport": "",
103
+ "applications_for_dependents_filed": "",
104
+ "other_petitions_filed": "",
105
+ "applications_for_replacement_i94": "",
106
+ "beneficiary_in_removal_proceedings": "",
107
+ "previous_nonimmigrant_petitions_filed_for_beneficiary": "",
108
+ "previous_immigrant_petitions_filed": "",
109
+ "is_new_petition": "",
110
+ "beneficiary_previously_held_same_classification_last_7_years": "",
111
+ "beneficiary_denied_same_classification_last_7_years": "",
112
+ "beneficiary_j1_or_j2_history": "",
113
+ "j1_j2_status_dates": ""
114
+ },
115
+
116
+ "employment_information": {
117
+ "job_title": "",
118
+ "lca_or_ETA_case_number": "",
119
+ "worksite_addresses": [
120
+ {
121
+ "street": "",
122
+ "apt_ste_flr": "",
123
+ "city": "",
124
+ "state": "",
125
+ "zip_code": "",
126
+ "is_third_party_location": "",
127
+ "third_party_name": ""
128
+ }
129
+ ],
130
+ "wages": {
131
+ "amount": "",
132
+ "frequency": ""
133
+ },
134
+ "other_compensation": "",
135
+ "employment_dates": {
136
+ "from": "",
137
+ "to": ""
138
+ },
139
+ "full_time_position": "",
140
+ "hours_per_week": "",
141
+ "will_work_in_cnmi": "",
142
+ "itinerary_included": ""
143
+ },
144
+
145
+ "export_control_certification": {
146
+ "license_not_required": "",
147
+ "license_required": ""
148
+ },
149
+
150
+ "h1b_classification_details": {
151
+ "prior_H_or_L_periods_of_stay": [],
152
+ "confirmation_number_h1b_registration": "",
153
+ "passport_used_for_registration": {
154
+ "number": "",
155
+ "country_of_issuance": "",
156
+ "expiration_date": ""
157
+ },
158
+ "proposed_duties_description": "",
159
+ "beneficiary_present_occupation_and_experience": "",
160
+ "beneficiary_controlling_interest_in_petitioner": "",
161
+ "controlling_interest_explanation": "",
162
+ "requesting_change_of_employer_and_was_previous_CNMI_exempt": "",
163
+ "subject_to_CNMI_cap_exemption": ""
164
+ },
165
+
166
+ "h1b_fee_and_dependency_information": {
167
+ "petitioner_is_h1b_dependent": "",
168
+ "petitioner_is_willful_violator": "",
169
+ "beneficiary_is_exempt_from_attestation": "",
170
+ "exemption_reason_salary_over_60000": "",
171
+ "exemption_reason_master_or_higher_degree": "",
172
+ "petitioner_has_50_or_more_employees": "",
173
+ "more_than_50_percent_in_H1B_L1_status": "",
174
+ "beneficiary_highest_education_level": "",
175
+ "field_of_study": "",
176
+ "rate_of_pay_per_year": "",
177
+ "dot_code": "",
178
+ "naics_code": "",
179
+ "acwia_fee_applicable": "",
180
+ "fraud_fee_applicable": "",
181
+ "public_law_114_113_fee_applicable": ""
182
+ },
183
+
184
+ "numerical_cap_information": {
185
+ "cap_type": "",
186
+ "masters_cap_degree_details": {
187
+ "us_institution_name": "",
188
+ "degree_type": "",
189
+ "date_awarded": "",
190
+ "institution_address": ""
191
+ },
192
+ "reason_for_cap_exemption": ""
193
+ },
194
+
195
+ "offsite_assignment_information": {
196
+ "assigned_offsite": "",
197
+ "complies_with_offsite_h1b_requirements": "",
198
+ "paid_actual_or_prevailing_wage_at_offsite": ""
199
+ },
200
+
201
+ "signatory_information": {
202
+ "authorized_signatory_name": "",
203
+ "title": "",
204
+ "signature_date": "",
205
+ "daytime_phone": "",
206
+ "email": ""
207
+ },
208
+
209
+ "preparer_information": {
210
+ "prepared_by_someone_else": "",
211
+ "preparer_name": "",
212
+ "preparer_business_name": "",
213
+ "preparer_address": "",
214
+ "preparer_contact": {
215
+ "daytime_phone": "",
216
+ "fax": "",
217
+ "email": ""
218
+ },
219
+ "preparer_signature_date": ""
220
+ },
221
+
222
+ "additional_information": ""
223
+ }
224
+ }
templates/i_94.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "I-94": {
3
+ "Record_number": "",
4
+ "Recent_date_of_entry (MM/DD/YY)": "",
5
+ "class_of_admission": "",
6
+ "admit_until_date (MM/DD/YY)": "",
7
+ "last_name": "",
8
+ "first_name": "",
9
+ "date_of_birth (MM/DD/YY)": "",
10
+ "passport_number": "",
11
+ "country_of_issue": "",
12
+ "gender (Fullform)": ""
13
+ }
14
+ }
templates/marriage_certificate.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "marriage_certificate":
3
+ {
4
+ "certificate_number": "",
5
+ "registration_number": "",
6
+ "issue_date (MM/DD/YY)": "",
7
+ "issuing_authority": "",
8
+ "issuing_jurisdiction": {
9
+ "city": "",
10
+ "county": "",
11
+ "state": "",
12
+ "country": ""
13
+ },
14
+ "marriage_date (MM/DD/YY)": "",
15
+ "marriage_location": {
16
+ "venue_name": "",
17
+ "city": "",
18
+ "county": "",
19
+ "state": "",
20
+ "country": ""
21
+ },
22
+ "Bride": {
23
+ "first_name": "",
24
+ "last_name": "",
25
+ "date_of_birth (MM/DD/YY)": "",
26
+ "place_of_birth": {
27
+ "city": "",
28
+ "state": "",
29
+ "country": ""
30
+ },
31
+ "residence": {
32
+ "city": "",
33
+ "state": "",
34
+ "country": ""
35
+ },
36
+ "father_name": "",
37
+ "mother_name": ""
38
+ },
39
+ "spouse_2": {
40
+ "first_name": "",
41
+ "last_name": "",
42
+ "date_of_birth (MM/DD/YY)": "",
43
+ "place_of_birth": {
44
+ "city": "",
45
+ "state": "",
46
+ "country": ""
47
+ },
48
+ "residence": {
49
+ "city": "",
50
+ "state": "",
51
+ "country": ""
52
+ },
53
+ "father_name": "",
54
+ "mother_name": ""
55
+ },
56
+ "officiant_info": {
57
+ "name": "",
58
+ "title": "",
59
+ "organization": ""
60
+ },
61
+ "witness_1": {
62
+ "name": "",
63
+ "address": ""
64
+ },
65
+ "witness_2": {
66
+ "name": "",
67
+ "address": ""
68
+ }
69
+ }
70
+ }
templates/passport.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "passport": {
3
+ "number": "",
4
+ "type": "",
5
+ "country_code" : "",
6
+ "surname": "",
7
+ "given_name": "",
8
+ "nationality": "",
9
+ "citizenship_country_name": "",
10
+ "date_of_birth (MM/DD/YY)": "",
11
+ "citizenship_number": "",
12
+ "sex (Fullform)": "",
13
+ "province_of_birth": "",
14
+ "country_of_birth": "",
15
+ "issue_date (MM/DD/YY)": "",
16
+ "expiration_date (MM/DD/YY)": "",
17
+ "issuing_country_name": "",
18
+ "fathers_name": ""
19
+ }
20
+ }
templates/proof_of_in_country_status.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"I-20":
2
+ {
3
+ "sevis_id": "",
4
+ "student_info": {
5
+ "surname_primary_name": "",
6
+ "given_name": "",
7
+ "preferred_name": "",
8
+ "passport_name": "",
9
+ "country_of_birth": "",
10
+ "country_of_citizenship": "",
11
+ "city_of_birth": "",
12
+ "date_of_birth (MM/DD/YY)": "",
13
+ "class_of_admission": ""
14
+ },
15
+ "school_information": {
16
+ "school_name": "",
17
+ "school_address": "",
18
+ "school_official_to_contact_upon_arrival": "",
19
+ "school_code_and_approval_date (MM/DD/YY)": "",
20
+ "admission_number": ""
21
+ },
22
+ "program_of_study": {
23
+ "education_level": "",
24
+ "major_1": "",
25
+ "major_2": "",
26
+ "program_english_proficiency": "",
27
+ "english_proficiency_notes": "",
28
+ "earliest_admission_date (MM/DD/YY)": "",
29
+ "start_of_classes (MM/DD/YY)": "",
30
+ "program_start_end_date (MM/DD/YY)": "",
31
+ "form_issue_reason": ""
32
+ },
33
+ "financials": {
34
+ "estimated_average_costs_9_months": {
35
+ "tuition_and_fees": "",
36
+ "living_expenses": "",
37
+ "expenses_of_dependents": "",
38
+ "books_health_insurance": "",
39
+ "total": ""
40
+ },
41
+ "students_funding_9_months": {
42
+ "personal_funds": "",
43
+ "funds_from_this_school": "",
44
+ "abroad_family_member": "",
45
+ "on_campus_employment": "",
46
+ "total": ""
47
+ }
48
+ },
49
+ "school_attestation": {
50
+ "signature_of": "",
51
+ "date_issued (MM/DD/YY)": "",
52
+ "place_issued": ""
53
+ },
54
+ "student_attestation": {
55
+ "signature_of": "",
56
+ "date (MM/DD/YY)": ""
57
+ },
58
+ "employment_authorizations": {
59
+ "current_session_start_date (MM/DD/YY)": "",
60
+ "current_session_end_date (MM/DD/YY)": ""
61
+ },
62
+ "travel_endorsement": {
63
+ "designated_school_official": "",
64
+ "title": "",
65
+ "date_issued (MM/DD/YY)": "",
66
+ "place_issued": ""
67
+ }
68
+ }
69
+ }
templates/resume.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "resume": {
3
+ "personal_information": {
4
+ "full_name": "",
5
+ "first_name": "",
6
+ "last_name": "",
7
+ "email": "",
8
+ "phone_number": "",
9
+ "linkedin_url": "",
10
+ "github_url": "",
11
+ "website_url": "",
12
+ "address": {
13
+ "street": "",
14
+ "apt_ste_flr": "",
15
+ "city": "",
16
+ "state": "",
17
+ "zip_code": "",
18
+ "country": ""
19
+ }
20
+ },
21
+ "professional_title": "",
22
+ "summary": "",
23
+ "work_experience": [
24
+ {
25
+ "job_title": "",
26
+ "employer_name": "",
27
+ "employment_type": "",
28
+ "location_city": "",
29
+ "location_state": "",
30
+ "location_country": "",
31
+ "start_date (MM/DD/YY)": "",
32
+ "end_date (MM/DD/YY)": "",
33
+ "is_current_role": "",
34
+ "description": "",
35
+ "achievements": [
36
+ ""
37
+ ],
38
+ "technologies_used": [
39
+ ""
40
+ ]
41
+ }
42
+ ],
43
+ "education": [
44
+ {
45
+ "institution_name": "",
46
+ "degree": "",
47
+ "field_of_study": "",
48
+ "start_date (MM/DD/YY)": "",
49
+ "end_date (MM/DD/YY)": "",
50
+ "is_current_program": "",
51
+ "gpa": "",
52
+ "location_city": "",
53
+ "location_state": "",
54
+ "location_country": ""
55
+ }
56
+ ],
57
+ "skills": {
58
+ "technical_skills": [
59
+ ""
60
+ ],
61
+ "soft_skills": [
62
+ ""
63
+ ],
64
+ "tools_and_technologies": [
65
+ ""
66
+ ],
67
+ "languages": [
68
+ {
69
+ "language": "",
70
+ "proficiency": ""
71
+ }
72
+ ]
73
+ },
74
+ "certifications": [
75
+ {
76
+ "name": "",
77
+ "issuer": "",
78
+ "issue_date (MM/DD/YY)": "",
79
+ "expiration_date (MM/DD/YY)": "",
80
+ "credential_id": "",
81
+ "credential_url": ""
82
+ }
83
+ ],
84
+ "projects": [
85
+ {
86
+ "name": "",
87
+ "role": "",
88
+ "start_date (MM/DD/YY)": "",
89
+ "end_date (MM/DD/YY)": "",
90
+ "is_current_project": "",
91
+ "description": "",
92
+ "responsibilities": [
93
+ ""
94
+ ],
95
+ "technologies_used": [
96
+ ""
97
+ ],
98
+ "project_url": ""
99
+ }
100
+ ],
101
+ "publications": [
102
+ {
103
+ "title": "",
104
+ "venue": "",
105
+ "publication_date (MM/DD/YY)": "",
106
+ "authors": [
107
+ ""
108
+ ],
109
+ "doi": "",
110
+ "url": ""
111
+ }
112
+ ],
113
+ "awards": [
114
+ {
115
+ "name": "",
116
+ "issuer": "",
117
+ "date (MM/DD/YY)": "",
118
+ "description": ""
119
+ }
120
+ ],
121
+ "professional_memberships": [
122
+ {
123
+ "organization": "",
124
+ "role": "",
125
+ "start_date (MM/DD/YY)": "",
126
+ "end_date (MM/DD/YY)": ""
127
+ }
128
+ ],
129
+ "additional_information": {
130
+ "work_authorization": "",
131
+ "security_clearance": "",
132
+ "willing_to_relocate": "",
133
+ "willing_to_travel": "",
134
+ "other_notes": ""
135
+ }
136
+ }
137
+ }
templates/school_transcripts.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "school_transcripts": {
3
+ "institution_information": {
4
+ "institution_name": "",
5
+ "school_or_college_name": "",
6
+ "campus": "",
7
+ "address": {
8
+ "street": "",
9
+ "apt_ste_flr": "",
10
+ "city": "",
11
+ "state": "",
12
+ "zip_code": "",
13
+ "country": ""
14
+ },
15
+ "telephone_number": "",
16
+ "email": "",
17
+ "website": ""
18
+ },
19
+ "student_information": {
20
+ "full_name": "",
21
+ "first_name": "",
22
+ "last_name": "",
23
+ "date_of_birth (MM/DD/YY)": "",
24
+ "student_id": "",
25
+ "program_name": "",
26
+ "degree_type": "",
27
+ "major": "",
28
+ "minor": "",
29
+ "admission_date (MM/DD/YY)": "",
30
+ "graduation_date (MM/DD/YY)": "",
31
+ "is_current_student": ""
32
+ },
33
+ "transcript_details": {
34
+ "issue_date (MM/DD/YY)": "",
35
+ "has_signature": "",
36
+ "has_official_seal": "",
37
+ "is_official_transcript": "",
38
+ "grading_scale": "",
39
+ "overall_gpa": "",
40
+ "credits_attempted": "",
41
+ "credits_earned": ""
42
+ },
43
+ "remarks": {
44
+ "academic_standing": "",
45
+ "honors": "",
46
+ "warnings_or_probations": "",
47
+ "other_notes": ""
48
+ },
49
+ "signatory": {
50
+ "name": "",
51
+ "title": "",
52
+ "signature_date (MM/DD/YY)": ""
53
+ }
54
+ }
55
+ }
templates/us_visa.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "us_visa":
3
+ {
4
+ "number": "",
5
+ "control_number": "",
6
+ "type": "",
7
+ "class": "",
8
+ "entries": "",
9
+ "issue_date (MM/DD/YY)": "",
10
+ "expiration_date (MM/DD/YY)": "",
11
+ "issuing_post": "",
12
+ "applicant_info": {
13
+ "surname": "",
14
+ "given_names": "",
15
+ "date_of_birth (MM/DD/YY)": "",
16
+ "nationality_country_name": "",
17
+ "sex (Fullform)": "",
18
+ "passport_number": ""
19
+ }
20
+ }
21
+ }