diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..43f6112e87451e24842dbba262b9c9472959c96f Binary files /dev/null and b/.DS_Store differ diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..dab9a4e17afd2ef39d90ccb0b40ef2786fe77422 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,35 +1,35 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text -*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow 
filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/API_KEY_QUICK_START.md b/API_KEY_QUICK_START.md new file mode 100644 index 0000000000000000000000000000000000000000..f6105de45080d333fdb909dad9bdd25174927dde --- /dev/null +++ b/API_KEY_QUICK_START.md @@ -0,0 +1,140 @@ +# API Key Authentication - Quick Start Guide + +## Summary + +API key authentication has been successfully implemented for external applications. The `/api/extract` endpoint now supports both JWT Bearer tokens and API keys. 
+ +## Quick Steps to Use from External Applications + +### 1. Get an API Key + +**Option A: Via Web UI (if available)** +- Log in to your account +- Navigate to API Keys section +- Create a new API key +- Copy and store it securely + +**Option B: Via API** + +```bash +# Step 1: Authenticate and get JWT token +curl -X POST https://your-api-url/api/auth/otp/request \ + -H "Content-Type: application/json" \ + -d '{"email": "your-email@company.com"}' + +# Step 2: Verify OTP +curl -X POST https://your-api-url/api/auth/otp/verify \ + -H "Content-Type: application/json" \ + -d '{"email": "your-email@company.com", "otp": "123456"}' + +# Step 3: Create API key (use token from step 2) +curl -X POST https://your-api-url/api/auth/api-key/create \ + -H "Authorization: Bearer YOUR_JWT_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"name": "My App"}' +``` + +**Response:** +```json +{ + "success": true, + "api_key": "sk_live_abc123...", // ⚠️ SAVE THIS! + "key_prefix": "sk_live_abc...", + "message": "API key created successfully. Store this key securely - it will not be shown again!" +} +``` + +### 2. Use API Key to Extract Documents + +```bash +curl -X POST https://your-api-url/api/extract \ + -H "X-API-Key: sk_live_abc123..." \ + -F "file=@document.pdf" \ + -F "key_fields=Invoice Number,Invoice Date,Total Amount" +``` + +## Authentication Methods + +The `/api/extract` endpoint accepts **either**: + +1. **API Key**: `X-API-Key: sk_live_...` header +2. 
**JWT Token**: `Authorization: Bearer ` header + +## New Endpoints + +- `POST /api/auth/api-key/create` - Create new API key (requires JWT) +- `GET /api/auth/api-keys` - List your API keys (requires JWT) +- `DELETE /api/auth/api-key/{key_id}` - Deactivate API key (requires JWT) + +## Security Features + +- ✅ API keys are hashed (SHA-256) before storage +- ✅ Only key prefix shown when listing keys +- ✅ Usage tracking (`last_used_at` timestamp) +- ✅ Soft delete (deactivation) support +- ✅ One key per user account + +## Example Code + +### Python +```python +import requests + +API_KEY = "sk_live_abc123..." +url = "https://your-api-url/api/extract" + +with open("document.pdf", "rb") as f: + response = requests.post( + url, + headers={"X-API-Key": API_KEY}, + files={"file": f}, + data={"key_fields": "Invoice Number,Invoice Date"} + ) + print(response.json()) +``` + +### JavaScript +```javascript +const FormData = require('form-data'); +const fs = require('fs'); +const axios = require('axios'); + +const form = new FormData(); +form.append('file', fs.createReadStream('document.pdf')); +form.append('key_fields', 'Invoice Number,Invoice Date'); + +axios.post('https://your-api-url/api/extract', form, { + headers: { + 'X-API-Key': 'sk_live_abc123...', + ...form.getHeaders() + } +}).then(response => console.log(response.data)); +``` + +## Full Documentation + +See `EXTERNAL_API_DOCUMENTATION.md` for complete documentation with: +- Detailed API reference +- Error handling +- Response formats +- Multiple language examples (Python, JavaScript, PHP) +- Best practices + +## Database Migration + +The new `api_keys` table will be created automatically when you restart the application (SQLAlchemy's `create_all` handles this). + +## Testing + +1. Start your backend server +2. Create an API key using the steps above +3. Test the extraction endpoint with the API key +4. 
Verify the response contains extracted data + +## Notes + +- API keys are shown **only once** when created - store them securely! +- Business email required for account creation +- Max file size: 4 MB +- Supported formats: PDF, PNG, JPEG, TIFF + diff --git a/Dockerfile b/Dockerfile index 982fc2f9c8c4f807905e69e03422d6ef6dc80046..1ef641026cc850d3b75aa971283bc2f45f5ccccc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,50 +1,83 @@ -# ---------- 1) Build frontend (React + Vite) ---------- -FROM node:20-alpine AS frontend-build -WORKDIR /frontend - -# Install frontend dependencies -COPY frontend/package*.json ./ -RUN npm install - -# Copy rest of frontend and build -COPY frontend/ . -RUN npm run build -# Vite will output to /frontend/dist by default - -# ---------- 2) Backend (FastAPI + Python) ---------- -FROM python:3.11-slim - -ENV PYTHONDONTWRITEBYTECODE=1 -ENV PYTHONUNBUFFERED=1 - -WORKDIR /app - -# System deps (optional but useful for some libs) -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - && rm -rf /var/lib/apt/lists/* - -# Install backend dependencies -COPY backend/requirements.txt ./backend/requirements.txt -RUN pip install --no-cache-dir -r backend/requirements.txt - -# Copy backend code -COPY backend ./backend - -# Copy built frontend into backend/frontend_dist -# FastAPI will serve from this folder later -RUN mkdir -p backend/frontend_dist -COPY --from=frontend-build /frontend/dist ./backend/frontend_dist - -# Create data directory for SQLite -RUN mkdir -p data - -# Env vars used in backend/db.py etc. 
-ENV DB_PATH=/app/data/app.db -ENV PORT=7860 -ENV PYTHONPATH=/app - -EXPOSE 7860 - -# Launch FastAPI app (we'll use backend.app.main:app) -CMD ["uvicorn", "backend.app.main:app", "--host", "0.0.0.0", "--port", "7860"] +# ---------- 1) Build frontend (React + Vite) ---------- +FROM node:20-alpine AS frontend-build +WORKDIR /frontend + +# Accept build arguments for Vite environment variables +ARG VITE_FIREBASE_API_KEY +ARG VITE_FIREBASE_AUTH_DOMAIN +ARG VITE_FIREBASE_PROJECT_ID +ARG VITE_FIREBASE_STORAGE_BUCKET +ARG VITE_FIREBASE_MESSAGING_SENDER_ID +ARG VITE_FIREBASE_APP_ID +ARG VITE_API_BASE_URL + +# Set as environment variables so they're available to the build script +ENV VITE_FIREBASE_API_KEY=$VITE_FIREBASE_API_KEY +ENV VITE_FIREBASE_AUTH_DOMAIN=$VITE_FIREBASE_AUTH_DOMAIN +ENV VITE_FIREBASE_PROJECT_ID=$VITE_FIREBASE_PROJECT_ID +ENV VITE_FIREBASE_STORAGE_BUCKET=$VITE_FIREBASE_STORAGE_BUCKET +ENV VITE_FIREBASE_MESSAGING_SENDER_ID=$VITE_FIREBASE_MESSAGING_SENDER_ID +ENV VITE_FIREBASE_APP_ID=$VITE_FIREBASE_APP_ID +ENV VITE_API_BASE_URL=$VITE_API_BASE_URL + +# Install frontend dependencies +COPY frontend/package*.json ./ +RUN npm install + +# Copy rest of frontend +COPY frontend/ . + +# Create .env file from environment variables and build +# Inline the script to avoid permission issues +RUN echo "Checking environment variables..." 
&& \ + [ -z "$VITE_FIREBASE_API_KEY" ] && echo "WARNING: VITE_FIREBASE_API_KEY is not set" || echo "✓ VITE_FIREBASE_API_KEY is set" && \ + [ -z "$VITE_FIREBASE_AUTH_DOMAIN" ] && echo "WARNING: VITE_FIREBASE_AUTH_DOMAIN is not set" || echo "✓ VITE_FIREBASE_AUTH_DOMAIN is set" && \ + [ -z "$VITE_FIREBASE_PROJECT_ID" ] && echo "WARNING: VITE_FIREBASE_PROJECT_ID is not set" || echo "✓ VITE_FIREBASE_PROJECT_ID is set" && \ + echo "VITE_FIREBASE_API_KEY=${VITE_FIREBASE_API_KEY:-}" > .env && \ + echo "VITE_FIREBASE_AUTH_DOMAIN=${VITE_FIREBASE_AUTH_DOMAIN:-}" >> .env && \ + echo "VITE_FIREBASE_PROJECT_ID=${VITE_FIREBASE_PROJECT_ID:-}" >> .env && \ + echo "VITE_FIREBASE_STORAGE_BUCKET=${VITE_FIREBASE_STORAGE_BUCKET:-}" >> .env && \ + echo "VITE_FIREBASE_MESSAGING_SENDER_ID=${VITE_FIREBASE_MESSAGING_SENDER_ID:-}" >> .env && \ + echo "VITE_FIREBASE_APP_ID=${VITE_FIREBASE_APP_ID:-}" >> .env && \ + echo "VITE_API_BASE_URL=${VITE_API_BASE_URL:-}" >> .env && \ + echo "Created .env file with environment variables" && \ + npm run build +# Vite will output to /frontend/dist by default + +# ---------- 2) Backend (FastAPI + Python) ---------- +FROM python:3.11-slim + +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 + +WORKDIR /app + +# System deps (optional but useful for some libs) +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +# Install backend dependencies +COPY backend/requirements.txt ./backend/requirements.txt +RUN pip install --no-cache-dir -r backend/requirements.txt + +# Copy backend code +COPY backend ./backend + +# Copy built frontend into backend/frontend_dist +# FastAPI will serve from this folder later +RUN mkdir -p backend/frontend_dist +COPY --from=frontend-build /frontend/dist ./backend/frontend_dist + +# Create data directory for SQLite +RUN mkdir -p data + +# Env vars used in backend/db.py etc. 
+ENV DB_PATH=/app/data/app.db +ENV PORT=7860 +ENV PYTHONPATH=/app + +EXPOSE 7860 + +# Launch FastAPI app (we'll use backend.app.main:app) +CMD ["uvicorn", "backend.app.main:app", "--host", "0.0.0.0", "--port", "7860"] diff --git a/EXTERNAL_API_DOCUMENTATION.md b/EXTERNAL_API_DOCUMENTATION.md new file mode 100644 index 0000000000000000000000000000000000000000..d6659c81bcf5fc959aaf1c8b1c7a03ed36f8f37f --- /dev/null +++ b/EXTERNAL_API_DOCUMENTATION.md @@ -0,0 +1,458 @@ +# External API Documentation + +This document explains how to use the Document Parsing API from external applications using API key authentication. + +## Table of Contents +1. [Overview](#overview) +2. [Authentication](#authentication) +3. [API Endpoints](#api-endpoints) +4. [Usage Examples](#usage-examples) +5. [Response Format](#response-format) +6. [Error Handling](#error-handling) + +## Overview + +The Document Parsing API allows external applications to extract text and structured data from PDF and image files. The API supports: + +- **File Types**: PDF, PNG, JPEG, TIFF +- **Max File Size**: 4 MB +- **Authentication**: API Key (via `X-API-Key` header) or JWT Bearer token +- **Response Format**: JSON + +## Authentication + +### Step 1: Create an Account + +First, you need to create an account using one of these methods: + +1. **Firebase Authentication** (via web UI) +2. **OTP Authentication** (via API) + +#### OTP Authentication Flow + +```bash +# 1. Request OTP +curl -X POST https://your-api-url/api/auth/otp/request \ + -H "Content-Type: application/json" \ + -d '{ + "email": "your-business-email@company.com" + }' + +# Response: +# { +# "success": true, +# "message": "OTP sent to your email" +# } + +# 2. Verify OTP and get JWT token +curl -X POST https://your-api-url/api/auth/otp/verify \ + -H "Content-Type: application/json" \ + -d '{ + "email": "your-business-email@company.com", + "otp": "123456" + }' + +# Response: +# { +# "token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...", +# "user": { ... 
} +# } +``` + +**Note**: Only business email addresses are allowed (no Gmail, Yahoo, etc.) + +### Step 2: Create an API Key + +Once authenticated, create an API key for your external application: + +```bash +# Create API key (requires JWT token from Step 1) +curl -X POST https://your-api-url/api/auth/api-key/create \ + -H "Authorization: Bearer YOUR_JWT_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "My External App" + }' + +# Response: +# { +# "success": true, +# "api_key": "sk_live_abc123...", # ⚠️ SAVE THIS - shown only once! +# "key_id": 1, +# "key_prefix": "sk_live_abc...", +# "name": "My External App", +# "created_at": "2024-01-15T10:30:00", +# "message": "API key created successfully. Store this key securely - it will not be shown again!" +# } +``` + +**⚠️ IMPORTANT**: The full API key is only shown once when created. Store it securely in your application's environment variables or secret management system. + +### Step 3: Use API Key for Authentication + +Use the API key in the `X-API-Key` header for all subsequent API calls: + +```bash +curl -X POST https://your-api-url/api/extract \ + -H "X-API-Key: sk_live_abc123..." \ + -F "file=@document.pdf" \ + -F "key_fields=Invoice Number,Invoice Date,Total Amount" +``` + +## API Endpoints + +### 1. Document Extraction + +**Endpoint**: `POST /api/extract` + +**Authentication**: +- API Key: `X-API-Key: ` +- OR JWT: `Authorization: Bearer ` + +**Parameters**: +- `file` (required): The document file (PDF, PNG, JPEG, TIFF) +- `key_fields` (optional): Comma-separated list of specific fields to extract + +**Example Request**: + +```bash +curl -X POST https://your-api-url/api/extract \ + -H "X-API-Key: sk_live_abc123..." \ + -F "file=@invoice.pdf" \ + -F "key_fields=Invoice Number,Invoice Date,Total Amount,PO Number" +``` + +**Example with cURL (file upload)**: + +```bash +curl -X POST https://your-api-url/api/extract \ + -H "X-API-Key: sk_live_abc123..." 
\ + -F "file=@/path/to/document.pdf" +``` + +### 2. List API Keys + +**Endpoint**: `GET /api/auth/api-keys` + +**Authentication**: JWT Bearer token (required) + +**Example**: + +```bash +curl -X GET https://your-api-url/api/auth/api-keys \ + -H "Authorization: Bearer YOUR_JWT_TOKEN" +``` + +**Response**: + +```json +{ + "success": true, + "api_keys": [ + { + "id": 1, + "name": "My External App", + "key_prefix": "sk_live_abc...", + "is_active": true, + "last_used_at": "2024-01-15T14:30:00", + "created_at": "2024-01-15T10:30:00" + } + ] +} +``` + +### 3. Delete API Key + +**Endpoint**: `DELETE /api/auth/api-key/{key_id}` + +**Authentication**: JWT Bearer token (required) + +**Example**: + +```bash +curl -X DELETE https://your-api-url/api/auth/api-key/1 \ + -H "Authorization: Bearer YOUR_JWT_TOKEN" +``` + +## Usage Examples + +### Python Example + +```python +import requests + +# API Configuration +API_BASE_URL = "https://your-api-url" +API_KEY = "sk_live_abc123..." # Your API key + +# Extract document +def extract_document(file_path, key_fields=None): + url = f"{API_BASE_URL}/api/extract" + headers = { + "X-API-Key": API_KEY + } + + with open(file_path, 'rb') as f: + files = {'file': f} + data = {} + if key_fields: + data['key_fields'] = key_fields + + response = requests.post(url, headers=headers, files=files, data=data) + response.raise_for_status() + return response.json() + +# Usage +result = extract_document("invoice.pdf", key_fields="Invoice Number,Invoice Date,Total Amount") +print(result) +``` + +### JavaScript/Node.js Example + +```javascript +const FormData = require('form-data'); +const fs = require('fs'); +const axios = require('axios'); + +// API Configuration +const API_BASE_URL = 'https://your-api-url'; +const API_KEY = 'sk_live_abc123...'; // Your API key + +// Extract document +async function extractDocument(filePath, keyFields = null) { + const form = new FormData(); + form.append('file', fs.createReadStream(filePath)); + if (keyFields) { + 
form.append('key_fields', keyFields); + } + + try { + const response = await axios.post(`${API_BASE_URL}/api/extract`, form, { + headers: { + 'X-API-Key': API_KEY, + ...form.getHeaders() + } + }); + return response.data; + } catch (error) { + console.error('Error:', error.response?.data || error.message); + throw error; + } +} + +// Usage +extractDocument('invoice.pdf', 'Invoice Number,Invoice Date,Total Amount') + .then(result => console.log(result)) + .catch(error => console.error(error)); +``` + +### PHP Example + +```php + new CURLFile($filePath) + ]; + + if ($keyFields) { + $postData['key_fields'] = $keyFields; + } + + curl_setopt_array($curl, [ + CURLOPT_URL => $url, + CURLOPT_RETURNTRANSFER => true, + CURLOPT_POST => true, + CURLOPT_POSTFIELDS => $postData, + CURLOPT_HTTPHEADER => [ + "X-API-Key: " . $apiKey + ] + ]); + + $response = curl_exec($curl); + $httpCode = curl_getinfo($curl, CURLINFO_HTTP_CODE); + curl_close($curl); + + if ($httpCode !== 200) { + throw new Exception("API request failed: " . $response); + } + + return json_decode($response, true); +} + +// Usage +try { + $result = extractDocument("invoice.pdf", "Invoice Number,Invoice Date,Total Amount"); + print_r($result); +} catch (Exception $e) { + echo "Error: " . 
$e->getMessage(); +} +?> +``` + +## Response Format + +### Success Response + +```json +{ + "id": 123, + "fileName": "invoice.pdf", + "fileType": "application/pdf", + "fileSize": "2.5 MB", + "status": "completed", + "confidence": 92.5, + "fieldsExtracted": 15, + "totalTime": 3500, + "fields": { + "page_1": { + "text": "Extracted text from page 1...", + "table": { + "row_1": { + "column_1": "value1", + "column_2": "value2" + } + }, + "footer_notes": ["Note 1", "Note 2"] + } + }, + "full_text": "Complete extracted text from all pages...", + "Fields": { + "Invoice Number": "INV-001", + "Invoice Date": "2024-01-15", + "Total Amount": "$1,234.56" + }, + "stages": { + "uploading": { + "time": 525, + "status": "completed", + "variation": "normal" + }, + "aiAnalysis": { + "time": 1925, + "status": "completed", + "variation": "normal" + }, + "dataExtraction": { + "time": 700, + "status": "completed", + "variation": "fast" + }, + "outputRendering": { + "time": 350, + "status": "completed", + "variation": "normal" + } + }, + "errorMessage": null +} +``` + +### Response Fields + +- `id`: Extraction record ID +- `fileName`: Original filename +- `fileType`: MIME type of the file +- `fileSize`: File size as string +- `status`: "completed" or "failed" +- `confidence`: Extraction confidence (0-100) +- `fieldsExtracted`: Number of fields extracted +- `totalTime`: Total processing time in milliseconds +- `fields`: Structured data with page-wise extraction (tables, text, metadata) +- `full_text`: Complete extracted text from all pages +- `Fields`: User-specified fields extracted (if `key_fields` parameter was provided) +- `stages`: Processing stage timings +- `errorMessage`: Error message if extraction failed + +## Error Handling + +### Authentication Errors + +**401 Unauthorized** - Invalid or missing API key: + +```json +{ + "detail": "Invalid API key" +} +``` + +**401 Unauthorized** - No authentication provided: + +```json +{ + "detail": "Authentication required. 
Provide either a Bearer token or X-API-Key header." +} +``` + +### Validation Errors + +**400 Bad Request** - File too large: + +```json +{ + "detail": "File size exceeds 4 MB limit. Your file is 5.2 MB." +} +``` + +**400 Bad Request** - Invalid file type: + +```json +{ + "detail": "Only PDF, PNG, JPG, and TIFF files are allowed." +} +``` + +### Processing Errors + +**500 Internal Server Error** - Extraction failed: + +```json +{ + "id": 123, + "status": "failed", + "confidence": 0.0, + "fieldsExtracted": 0, + "errorMessage": "OCR processing failed: ..." +} +``` + +## Best Practices + +1. **Store API Keys Securely**: Never commit API keys to version control. Use environment variables or secret management systems. + +2. **Handle Errors Gracefully**: Always check the `status` field in the response. If `status` is "failed", check `errorMessage` for details. + +3. **Respect Rate Limits**: If rate limiting is implemented, handle 429 responses appropriately with exponential backoff. + +4. **Validate File Types**: Check file type and size before uploading to avoid unnecessary API calls. + +5. **Use Specific Fields**: When you know what fields to extract, use the `key_fields` parameter for better accuracy and faster processing. + +6. **Monitor API Key Usage**: Regularly check your API keys via the `/api/auth/api-keys` endpoint to monitor usage and detect unauthorized access. + +## Security Notes + +- API keys are hashed before storage in the database +- Only the key prefix is shown when listing API keys +- API keys can be deactivated (soft deleted) but not permanently deleted +- Each API key is tied to a specific user account +- API key usage is tracked with `last_used_at` timestamp + +## Support + +For issues or questions: +1. Check the error message in the API response +2. Verify your API key is active and correct +3. Ensure your file meets the requirements (type, size) +4. 
Check the API status endpoint: `GET /ping` + diff --git a/FIREBASE_OTP_SETUP.md b/FIREBASE_OTP_SETUP.md new file mode 100644 index 0000000000000000000000000000000000000000..4ec6af03cded08a8321d39460731a79516a7446a --- /dev/null +++ b/FIREBASE_OTP_SETUP.md @@ -0,0 +1,296 @@ +# Firebase Authentication + OTP Setup Guide + +This application uses Firebase Authentication for Google sign-in and Brevo for OTP email delivery. Only business email addresses are allowed. + +## Prerequisites + +1. Firebase project +2. Brevo account (for sending OTP emails) +3. Business email domain verification + +--- + +## Step 1: Firebase Setup + +### 1.1 Create Firebase Project + +1. Go to [Firebase Console](https://console.firebase.google.com/) +2. Click "Add project" or select an existing project +3. Follow the setup wizard + +### 1.2 Enable Google Authentication + +1. In Firebase Console, go to **Authentication** → **Sign-in method** +2. Click on **Google** provider +3. Enable it and set your project support email +4. Save the changes + +### 1.3 Get Firebase Web App Configuration + +1. In Firebase Console, go to **Project Settings** (gear icon) +2. Scroll down to "Your apps" section +3. Click the **Web** icon (``) to add a web app +4. Register your app (you can skip Firebase Hosting for now) +5. Copy the Firebase configuration object + +### 1.4 Get Firebase Service Account Key + +1. In Firebase Console, go to **Project Settings** → **Service accounts** +2. Click **Generate new private key** +3. Download the JSON file (keep it secure!) 
+ +### 1.5 Set Frontend Environment Variables + +Create or update `frontend/.env`: + +```bash +VITE_FIREBASE_API_KEY=your-api-key +VITE_FIREBASE_AUTH_DOMAIN=your-project.firebaseapp.com +VITE_FIREBASE_PROJECT_ID=your-project-id +VITE_FIREBASE_STORAGE_BUCKET=your-project.appspot.com +VITE_FIREBASE_MESSAGING_SENDER_ID=your-sender-id +VITE_FIREBASE_APP_ID=your-app-id +``` + +### 1.6 Set Backend Environment Variables + +You have two options for Firebase Admin SDK: + +**Option A: Service Account JSON File** +```bash +FIREBASE_SERVICE_ACCOUNT_KEY=/path/to/service-account-key.json +``` + +**Option B: Service Account JSON String (Recommended for Docker/Cloud/Hugging Face Spaces)** +```bash +FIREBASE_SERVICE_ACCOUNT_JSON='{"type":"service_account","project_id":"...","private_key_id":"...","private_key":"...","client_email":"...","client_id":"...","auth_uri":"...","token_uri":"...","auth_provider_x509_cert_url":"...","client_x509_cert_url":"..."}' +``` + +**For Hugging Face Spaces:** +- Use **Option B** (JSON String) as a **Secret** (Private) +- Copy the entire contents of your service account JSON file +- Paste it as the value for `FIREBASE_SERVICE_ACCOUNT_JSON` +- Make sure to keep the single quotes around the JSON if setting via command line, or just paste the raw JSON in the Spaces UI + +--- + +## Step 2: Brevo Setup + +### 2.1 Create Brevo Account + +1. Go to [Brevo](https://www.brevo.com/) (formerly Sendinblue) +2. Sign up for a free account (300 emails/day free tier) +3. Verify your email address + +### 2.2 Get API Key + +1. Log in to Brevo +2. Go to **Settings** → **API Keys** +3. Click **Generate a new API key** +4. Copy the API key (starts with `xkeysib-...`) + +### 2.3 Verify Sender Email + +1. Go to **Senders & IP** → **Senders** +2. Click **Add a sender** +3. Enter your sender email (e.g., `noreply@yourdomain.com`) +4. Verify the email address (check your inbox for verification email) +5. 
Once verified, you can use it to send emails + +### 2.4 Set Backend Environment Variables + +```bash +BREVO_API_KEY=xkeysib-your-api-key-here +BREVO_SENDER_EMAIL=noreply@yourdomain.com +BREVO_SENDER_NAME=EZOFIS AI +``` + +--- + +## Step 3: JWT Secret Key + +Generate a strong random string for JWT token signing: + +```bash +# Generate a random secret (Linux/Mac) +openssl rand -hex 32 + +# Or use Python +python -c "import secrets; print(secrets.token_hex(32))" +``` + +Set the environment variable: + +```bash +JWT_SECRET_KEY=your-generated-secret-key-here +``` + +--- + +## Step 4: Frontend URL + +Set the frontend URL for OAuth redirects: + +```bash +FRONTEND_URL=http://localhost:5173 # Development +# OR +FRONTEND_URL=https://your-domain.com # Production +``` + +--- + +## Step 5: Install Dependencies + +### Backend + +```bash +cd backend +pip install -r requirements.txt +``` + +### Frontend + +```bash +cd frontend +npm install +``` + +--- + +## Step 6: Database Migration + +The database will automatically create the new schema when you start the application. However, if you have existing data: + +**Option 1: Fresh Start (Recommended for Development)** +- Delete the existing database file: `data/app.db` +- Restart the application (tables will be recreated) + +**Option 2: Manual Migration (For Production)** +- The new `users` table will be created automatically +- Existing `extractions` table needs `user_id` column added +- You'll need to assign existing records to a default user or migrate them + +--- + +## Step 7: Test the Setup + +### 7.1 Test Firebase Authentication + +1. Start the backend server +2. Start the frontend development server +3. Navigate to the application +4. Click "Google Sign In" +5. Sign in with a business Google account +6. Verify you're redirected to the dashboard + +### 7.2 Test OTP Authentication + +1. Click on "Email / OTP" tab +2. Enter a business email address +3. Click "Send OTP" +4. Check your email for the OTP code +5. 
Enter the OTP and verify +6. Verify you're redirected to the dashboard + +### 7.3 Test Business Email Validation + +1. Try to sign in with a personal Gmail account +2. Verify you get an error message +3. Try OTP with a personal email +4. Verify it's blocked + +--- + +## Environment Variables Summary + +### Backend (.env or environment) + +```bash +# Firebase +FIREBASE_SERVICE_ACCOUNT_JSON='{...}' # OR +FIREBASE_SERVICE_ACCOUNT_KEY=/path/to/key.json + +# Brevo +BREVO_API_KEY=xkeysib-... +BREVO_SENDER_EMAIL=noreply@yourdomain.com +BREVO_SENDER_NAME=EZOFIS AI + +# JWT +JWT_SECRET_KEY=your-secret-key + +# Frontend URL +FRONTEND_URL=http://localhost:5173 # For local development +# For Hugging Face Spaces: https://your-username-ezofisocr.hf.space +``` + +**For Hugging Face Spaces:** +- Set `FIREBASE_SERVICE_ACCOUNT_JSON`, `BREVO_API_KEY`, and `JWT_SECRET_KEY` as **Secrets (Private)** +- Set `BREVO_SENDER_EMAIL`, `BREVO_SENDER_NAME`, and `FRONTEND_URL` as **Variables (Public)** +- See `HUGGINGFACE_SPACES_SETUP.md` for detailed instructions + +### Frontend (.env) + +```bash +VITE_FIREBASE_API_KEY=... +VITE_FIREBASE_AUTH_DOMAIN=... +VITE_FIREBASE_PROJECT_ID=... +VITE_FIREBASE_STORAGE_BUCKET=... +VITE_FIREBASE_MESSAGING_SENDER_ID=... +VITE_FIREBASE_APP_ID=... 
+VITE_API_BASE_URL=http://localhost:7860 +``` + +--- + +## Troubleshooting + +### Firebase Issues + +- **"Firebase not configured"**: Check that `FIREBASE_SERVICE_ACCOUNT_JSON` or `FIREBASE_SERVICE_ACCOUNT_KEY` is set correctly +- **"Invalid Firebase token"**: Ensure Firebase Web SDK is properly configured in frontend +- **"Email not found"**: Make sure Google sign-in is enabled in Firebase Console + +### Brevo Issues + +- **"Failed to send email"**: + - Verify your API key is correct + - Check that sender email is verified in Brevo + - Ensure you haven't exceeded the free tier limit (300 emails/day) +- **"API key not set"**: Check that `BREVO_API_KEY` environment variable is set + +### Business Email Validation + +- Personal emails (Gmail, Yahoo, etc.) are automatically blocked +- Only business/corporate email domains are allowed +- The validation happens on both frontend and backend + +--- + +## Security Notes + +1. **Never commit** Firebase service account keys or API keys to version control +2. Use environment variables or secure secret management +3. JWT tokens expire after 7 days +4. OTP codes expire after 10 minutes +5. Maximum 5 OTP verification attempts per email +6. All extraction records are filtered by user_id for data isolation + +--- + +## Production Deployment + +1. Set all environment variables in your hosting platform +2. Use HTTPS for both frontend and backend +3. Update `FRONTEND_URL` to your production domain +4. Verify sender email in Brevo with your production domain +5. Consider using Redis for OTP storage instead of in-memory (for scalability) +6. 
Set up proper error monitoring and logging + +--- + +## Support + +For issues: +- Firebase: [Firebase Documentation](https://firebase.google.com/docs) +- Brevo: [Brevo API Documentation](https://developers.brevo.com/) + diff --git a/GOOGLE_OAUTH_SETUP.md b/GOOGLE_OAUTH_SETUP.md new file mode 100644 index 0000000000000000000000000000000000000000..ec9cd0dbb840891fe2d7e90837d32fbc7cac8e81 --- /dev/null +++ b/GOOGLE_OAUTH_SETUP.md @@ -0,0 +1,79 @@ +# Google OAuth Setup Guide + +This application uses Google OAuth for user authentication. Follow these steps to set it up: + +## 1. Create Google OAuth Credentials + +1. Go to the [Google Cloud Console](https://console.cloud.google.com/) +2. Create a new project or select an existing one +3. Enable the Google+ API +4. Go to "Credentials" → "Create Credentials" → "OAuth client ID" +5. Choose "Web application" +6. Add authorized redirect URIs: + - For development: `http://localhost:7860/api/auth/callback` + - For production: `https://your-domain.com/api/auth/callback` +7. Copy the Client ID and Client Secret + +## 2. Set Environment Variables + +Set the following environment variables: + +```bash +# Google OAuth +GOOGLE_CLIENT_ID=your-client-id-here +GOOGLE_CLIENT_SECRET=your-client-secret-here + +# JWT Secret (use a strong random string) +JWT_SECRET_KEY=your-secret-key-here + +# Frontend URL (for OAuth redirect) +FRONTEND_URL=http://localhost:5173 # or your production URL +``` + +## 3. Database Migration + +The database will automatically create the new `users` table and add `user_id` to the `extractions` table when you start the application. + +**Note:** If you have an existing database with extraction records, you'll need to: +1. Back up your data +2. Delete the old database file +3. Restart the application to recreate tables with the new schema + +Or manually migrate: +- Add `user_id` column to `extractions` table (you may need to set a default user_id for existing records) + +## 4. 
Install Dependencies + +Make sure to install the new Python dependencies: + +```bash +pip install -r backend/requirements.txt +``` + +New dependencies added: +- `authlib` - OAuth library +- `pyjwt` - JWT token handling +- `python-jose[cryptography]` - JWT verification + +## 5. Start the Application + +1. Start the backend server +2. Start the frontend development server +3. Users will be prompted to sign in with Google when they try to access the application + +## How It Works + +1. User clicks "Sign in with Google" → redirected to Google login +2. After authentication, Google redirects to `/api/auth/callback` +3. Backend creates/updates user in database and generates JWT token +4. Frontend receives token and stores it in localStorage +5. All API requests include the JWT token in the Authorization header +6. Backend verifies token and filters data by user_id + +## Security Notes + +- JWT tokens expire after 7 days +- Tokens are stored in localStorage (consider httpOnly cookies for production) +- All extraction records are filtered by user_id +- Users can only see their own data and history + diff --git a/HUGGINGFACE_SPACES_SETUP.md b/HUGGINGFACE_SPACES_SETUP.md new file mode 100644 index 0000000000000000000000000000000000000000..1f56f371824b862fd3f234a0ea2edac4947bbef2 --- /dev/null +++ b/HUGGINGFACE_SPACES_SETUP.md @@ -0,0 +1,186 @@ +# Hugging Face Spaces Setup Guide + +This guide provides specific instructions for deploying the EZOFIS OCR application to Hugging Face Spaces. + +## Prerequisites + +1. ✅ Firebase project configured +2. ✅ Brevo account set up +3. 
✅ Hugging Face account with a Space created + +--- + +## Step 1: Frontend Environment Variables + +Set these in **Hugging Face Spaces → Settings → Variables and secrets**: + +### Variables and Secrets: +- `VITE_FIREBASE_API_KEY` → Set as **Secret (Private)** ✅ +- `VITE_FIREBASE_AUTH_DOMAIN` → Set as **Variable (Public)** +- `VITE_FIREBASE_PROJECT_ID` → Set as **Variable (Public)** +- `VITE_FIREBASE_STORAGE_BUCKET` → Set as **Variable (Public)** +- `VITE_FIREBASE_MESSAGING_SENDER_ID` → Set as **Variable (Public)** +- `VITE_FIREBASE_APP_ID` → Set as **Variable (Public)** +- `VITE_API_BASE_URL` → Set as **Variable (Public)** (e.g., `https://your-username-ezofisocr.hf.space`) + +**Note:** These variables are used during the Docker build process. The Dockerfile has been updated to accept them as build arguments. + +--- + +## Step 2: Backend Environment Variables + +Set these in **Hugging Face Spaces → Settings → Variables and secrets**: + +### Secrets (Private): +1. **`FIREBASE_SERVICE_ACCOUNT_JSON`** + - Get your Firebase service account JSON file (from Firebase Console → Project Settings → Service accounts) + - Copy the **entire JSON content** (all in one line or formatted) + - Paste it as the value for this secret + - Example format: + ```json + {"type":"service_account","project_id":"your-project","private_key_id":"...","private_key":"-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n","client_email":"...","client_id":"...","auth_uri":"https://accounts.google.com/o/oauth2/auth","token_uri":"https://oauth2.googleapis.com/token","auth_provider_x509_cert_url":"https://www.googleapis.com/oauth2/v1/certs","client_x509_cert_url":"..."} + ``` + +2. **`BREVO_API_KEY`** + - Get from Brevo → Settings → API Keys + - Format: `xkeysib-...` + +3. **`JWT_SECRET_KEY`** + - Generate a secure random key: + ```bash + openssl rand -hex 32 + ``` + - Or use Python: + ```bash + python -c "import secrets; print(secrets.token_hex(32))" + ``` + +### Variables (Public): +1. 
**`BREVO_SENDER_EMAIL`** + - Your verified sender email in Brevo + - Example: `noreply@yourdomain.com` + +2. **`BREVO_SENDER_NAME`** + - Display name for emails + - Example: `EZOFIS AI` + +3. **`FRONTEND_URL`** + - Your Hugging Face Space URL + - Format: `https://your-username-ezofisocr.hf.space` + - Replace `your-username` with your actual Hugging Face username + +--- + +## Step 3: Verify Dockerfile + +The Dockerfile has been updated to accept frontend environment variables as build arguments. Make sure your `Dockerfile` includes: + +```dockerfile +# Accept build arguments for Vite environment variables +ARG VITE_FIREBASE_API_KEY +ARG VITE_FIREBASE_AUTH_DOMAIN +ARG VITE_FIREBASE_PROJECT_ID +ARG VITE_FIREBASE_STORAGE_BUCKET +ARG VITE_FIREBASE_MESSAGING_SENDER_ID +ARG VITE_FIREBASE_APP_ID +ARG VITE_API_BASE_URL + +# Set as environment variables so Vite can access them during build +ENV VITE_FIREBASE_API_KEY=$VITE_FIREBASE_API_KEY +ENV VITE_FIREBASE_AUTH_DOMAIN=$VITE_FIREBASE_AUTH_DOMAIN +ENV VITE_FIREBASE_PROJECT_ID=$VITE_FIREBASE_PROJECT_ID +ENV VITE_FIREBASE_STORAGE_BUCKET=$VITE_FIREBASE_STORAGE_BUCKET +ENV VITE_FIREBASE_MESSAGING_SENDER_ID=$VITE_FIREBASE_MESSAGING_SENDER_ID +ENV VITE_FIREBASE_APP_ID=$VITE_FIREBASE_APP_ID +ENV VITE_API_BASE_URL=$VITE_API_BASE_URL +``` + +--- + +## Step 4: Deploy + +1. **Commit and push** your code to the Hugging Face Space repository + - Make sure `frontend/build-env.sh` is included in your commit +2. **Wait for the build** to complete (check the "Logs" tab) + - Look for "Checking environment variables..." messages in the build logs + - Verify all variables show "✓ ... is set" (not "WARNING: ... is not set") +3. **Test the deployment**: + - Open your Space URL + - Try Firebase login + - Try OTP authentication + +**Important:** After setting or updating environment variables in Hugging Face Spaces, you need to **rebuild** the Space for the changes to take effect. 
The frontend is built during the Docker build process, so environment variable changes require a rebuild. + +--- + +## Troubleshooting + +### Build Fails with "VITE_* variables not found" +- **Solution:** Make sure all `VITE_*` variables are set in Spaces → Variables and secrets +- Hugging Face Spaces automatically makes environment variables available during build +- The Dockerfile uses a build script to create a `.env` file from these variables + +### Firebase Authentication Not Working - "auth/invalid-api-key" Error +- **Check:** `VITE_FIREBASE_API_KEY` is set correctly (as a Secret) and contains the full API key +- **Check:** All other `VITE_FIREBASE_*` variables are set with correct values +- **Check:** After updating variables, rebuild the Space (the frontend needs to be rebuilt) +- **Check:** Firebase Console → Authentication → Sign-in method → Google is enabled +- **Check:** The API key matches the one in Firebase Console → Project Settings → Your apps +- **Solution:** If the error persists, check the build logs to see if the `.env` file is being created correctly + +### OTP Emails Not Sending +- **Check:** `BREVO_API_KEY` is set correctly (as a Secret) +- **Check:** `BREVO_SENDER_EMAIL` is verified in Brevo +- **Check:** `BREVO_SENDER_NAME` is set +- **Check:** You haven't exceeded Brevo free tier (300 emails/day) + +### Backend Errors +- **Check:** `FIREBASE_SERVICE_ACCOUNT_JSON` contains the full JSON (all fields) +- **Check:** `JWT_SECRET_KEY` is set +- **Check:** `FRONTEND_URL` matches your Space URL exactly + +--- + +## Environment Variables Checklist + +### Frontend (Build-time): +- [ ] `VITE_FIREBASE_API_KEY` (Secret) +- [ ] `VITE_FIREBASE_AUTH_DOMAIN` (Variable) +- [ ] `VITE_FIREBASE_PROJECT_ID` (Variable) +- [ ] `VITE_FIREBASE_STORAGE_BUCKET` (Variable) +- [ ] `VITE_FIREBASE_MESSAGING_SENDER_ID` (Variable) +- [ ] `VITE_FIREBASE_APP_ID` (Variable) +- [ ] `VITE_API_BASE_URL` (Variable) + +### Backend (Runtime): +- [ ] `FIREBASE_SERVICE_ACCOUNT_JSON` 
(Secret) +- [ ] `BREVO_API_KEY` (Secret) +- [ ] `JWT_SECRET_KEY` (Secret) +- [ ] `BREVO_SENDER_EMAIL` (Variable) +- [ ] `BREVO_SENDER_NAME` (Variable) +- [ ] `FRONTEND_URL` (Variable) + +--- + +## Notes + +1. **Build vs Runtime:** Frontend variables (`VITE_*`) are used during Docker build, backend variables are used at runtime. + +2. **Secrets vs Variables:** + - Use **Secrets** for sensitive data (API keys, private keys, JWT secrets) + - Use **Variables** for non-sensitive configuration (URLs, display names) + +3. **Firebase Service Account JSON:** When copying the JSON, make sure to include the entire content, including the `private_key` field with newlines preserved (they should be `\n` in the JSON string). + +4. **Space URL:** Your Space URL format is `https://{username}-{space-name}.hf.space`. Make sure `FRONTEND_URL` and `VITE_API_BASE_URL` match this exactly. + +--- + +## Support + +If you encounter issues: +1. Check the build logs in Hugging Face Spaces +2. Verify all environment variables are set correctly +3. Ensure Firebase and Brevo are properly configured +4. Review the main setup guide: `FIREBASE_OTP_SETUP.md` + diff --git a/IMPLEMENTATION_COMPLETE.md b/IMPLEMENTATION_COMPLETE.md new file mode 100644 index 0000000000000000000000000000000000000000..63ca34258b25318d459e87775f7237f7d4bd1e15 --- /dev/null +++ b/IMPLEMENTATION_COMPLETE.md @@ -0,0 +1,257 @@ +# ✅ Firebase + OTP Authentication Implementation Complete + +All code changes have been applied successfully! 
Here are the next steps you need to follow: + +## 📋 Implementation Summary + +### ✅ Backend Changes (Completed) +- ✅ Updated `requirements.txt` with Firebase Admin SDK +- ✅ Updated `models.py` - User model now supports Firebase and OTP auth methods +- ✅ Created `email_validator.py` - Business email validation +- ✅ Created `firebase_auth.py` - Firebase token verification +- ✅ Created `brevo_service.py` - Brevo email service for OTP +- ✅ Created `otp_service.py` - OTP generation and verification +- ✅ Updated `auth_routes.py` - New endpoints for Firebase and OTP login + +### ✅ Frontend Changes (Completed) +- ✅ Updated `package.json` with Firebase SDK +- ✅ Created `config/firebase.js` - Firebase configuration +- ✅ Updated `services/auth.js` - Firebase and OTP auth functions +- ✅ Updated `contexts/AuthContext.jsx` - Firebase and OTP support +- ✅ Created `components/auth/LoginForm.jsx` - Login UI with both options +- ✅ Updated `App.jsx` - Integrated LoginForm component + +--- + +## 🚀 Next Steps (YOU NEED TO DO THESE) + +### Step 1: Install Dependencies + +**Backend:** +```bash +cd backend +pip install -r requirements.txt +``` + +**Frontend:** +```bash +cd frontend +npm install +``` + +--- + +### Step 2: Set Up Firebase + +1. **Create Firebase Project:** + - Go to https://console.firebase.google.com/ + - Create a new project or use existing one + +2. **Enable Google Authentication:** + - In Firebase Console → Authentication → Sign-in method + - Enable "Google" provider + - Set project support email + +3. **Get Web App Config:** + - Project Settings → Your apps → Add Web app + - Copy the config values + +4. **Get Service Account Key:** + - Project Settings → Service accounts + - Click "Generate new private key" + - Download the JSON file + +5. 
**Set Frontend Environment Variables:** + Create `frontend/.env`: + ```bash + VITE_FIREBASE_API_KEY=your-api-key-here + VITE_FIREBASE_AUTH_DOMAIN=your-project.firebaseapp.com + VITE_FIREBASE_PROJECT_ID=your-project-id + VITE_FIREBASE_STORAGE_BUCKET=your-project.appspot.com + VITE_FIREBASE_MESSAGING_SENDER_ID=your-sender-id + VITE_FIREBASE_APP_ID=your-app-id + VITE_API_BASE_URL=http://localhost:7860 + ``` + +6. **Set Backend Environment Variables:** + Option A (JSON file path): + ```bash + FIREBASE_SERVICE_ACCOUNT_KEY=/path/to/service-account-key.json + ``` + + Option B (JSON string - recommended for Docker): + ```bash + FIREBASE_SERVICE_ACCOUNT_JSON='{"type":"service_account","project_id":"...","private_key":"...","client_email":"..."}' + ``` + (Copy the entire JSON content from the downloaded file) + +--- + +### Step 3: Set Up Brevo + +1. **Create Brevo Account:** + - Go to https://www.brevo.com/ + - Sign up (free tier: 300 emails/day) + +2. **Get API Key:** + - Settings → API Keys + - Generate new API key + - Copy the key (starts with `xkeysib-`) + +3. **Verify Sender Email:** + - Senders & IP → Senders + - Add sender email (e.g., `noreply@yourdomain.com`) + - Verify via email + +4. 
**Set Backend Environment Variables:** + ```bash + BREVO_API_KEY=xkeysib-your-api-key-here + BREVO_SENDER_EMAIL=noreply@yourdomain.com + BREVO_SENDER_NAME=EZOFIS AI + ``` + +--- + +### Step 4: Set JWT Secret + +Generate a secure random key: +```bash +# Linux/Mac +openssl rand -hex 32 + +# Or Python +python -c "import secrets; print(secrets.token_hex(32))" +``` + +Set environment variable: +```bash +JWT_SECRET_KEY=your-generated-secret-key-here +``` + +--- + +### Step 5: Set Frontend URL + +```bash +FRONTEND_URL=http://localhost:5173 # Development +# OR +FRONTEND_URL=https://your-domain.com # Production +``` + +--- + +### Step 6: Database Migration + +**If you have existing data:** +- The new schema will be created automatically +- Existing `extractions` table needs `user_id` column +- You may need to assign existing records to a default user + +**For fresh start (recommended for development):** +- Delete `data/app.db` (if exists) +- Restart application - tables will be recreated + +--- + +### Step 7: Test the Implementation + +1. **Start Backend:** + ```bash + cd backend + uvicorn app.main:app --reload --port 7860 + ``` + +2. **Start Frontend:** + ```bash + cd frontend + npm run dev + ``` + +3. **Test Firebase Login:** + - Navigate to http://localhost:5173 + - Click "Google Sign In" tab + - Sign in with business Google account + - Should redirect to dashboard + +4. **Test OTP Login:** + - Click "Email / OTP" tab + - Enter business email + - Click "Send OTP" + - Check email for OTP code + - Enter OTP and verify + - Should redirect to dashboard + +5. 
**Test Business Email Validation:** + - Try personal Gmail account → Should be blocked + - Try OTP with personal email → Should be blocked + +--- + +## 📝 Environment Variables Checklist + +### Backend (.env or system environment) +- [ ] `FIREBASE_SERVICE_ACCOUNT_JSON` or `FIREBASE_SERVICE_ACCOUNT_KEY` +- [ ] `BREVO_API_KEY` +- [ ] `BREVO_SENDER_EMAIL` +- [ ] `BREVO_SENDER_NAME` +- [ ] `JWT_SECRET_KEY` +- [ ] `FRONTEND_URL` + +### Frontend (.env) +- [ ] `VITE_FIREBASE_API_KEY` +- [ ] `VITE_FIREBASE_AUTH_DOMAIN` +- [ ] `VITE_FIREBASE_PROJECT_ID` +- [ ] `VITE_FIREBASE_STORAGE_BUCKET` +- [ ] `VITE_FIREBASE_MESSAGING_SENDER_ID` +- [ ] `VITE_FIREBASE_APP_ID` +- [ ] `VITE_API_BASE_URL` + +--- + +## 🔒 Security Reminders + +1. ✅ Never commit API keys or secrets to git +2. ✅ Use `.env` files (add to `.gitignore`) +3. ✅ Business email validation is enforced on both frontend and backend +4. ✅ JWT tokens expire after 7 days +5. ✅ OTP codes expire after 10 minutes +6. ✅ Maximum 5 OTP verification attempts + +--- + +## 📚 Documentation + +- **Firebase Setup:** See `FIREBASE_OTP_SETUP.md` for detailed instructions +- **Brevo API:** https://developers.brevo.com/reference/sendtransacemail + +--- + +## ⚠️ Important Notes + +1. **Database Schema Change:** + - User model changed from `google_id` (required) to `firebase_uid` (optional) + - If you have existing users, you'll need to migrate the data + - For development, deleting `data/app.db` is the easiest option + +2. **Business Email Validation:** + - Personal email domains are blocked (Gmail, Yahoo, Outlook, etc.) + - Validation happens on both frontend and backend + - Users must use their work/corporate email addresses + +3. **OTP Storage:** + - Currently stored in memory (works for single server) + - For production with multiple servers, consider using Redis + +--- + +## 🎉 You're All Set! + +Once you complete the setup steps above, your application will have: +- ✅ Firebase Google Sign-in (no OAuth credentials needed!) 
+- ✅ Email/OTP authentication via Brevo +- ✅ Business email validation +- ✅ User-specific data isolation +- ✅ Secure JWT token authentication + +Good luck! 🚀 + diff --git a/README.md b/README.md index 83b7b62b26fd1651962a88fec11a6d9f7b0af1ba..052854aab2a6894ac23e68809c0d4bb4fa25f228 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ ---- -title: EZOFISAIOCR -emoji: 🌍 -colorFrom: blue -colorTo: purple -sdk: docker -pinned: false ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +--- +title: EZOFISAIOCR +emoji: 🌍 +colorFrom: blue +colorTo: purple +sdk: docker +pinned: false +--- + +Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/backend/app/api_key_auth.py b/backend/app/api_key_auth.py new file mode 100644 index 0000000000000000000000000000000000000000..503e05e41120bd1372c1d571429281b7ca050da5 --- /dev/null +++ b/backend/app/api_key_auth.py @@ -0,0 +1,100 @@ +import os +import secrets +import hashlib +from datetime import datetime +from typing import Optional +from fastapi import Depends, HTTPException, status, Header +from sqlalchemy.orm import Session +from .db import SessionLocal +from .models import APIKey, User + + +def get_db(): + """Database dependency.""" + db = SessionLocal() + try: + yield db + finally: + db.close() + + +def generate_api_key() -> str: + """ + Generate a secure API key. + Format: sk_live_ + """ + random_bytes = secrets.token_bytes(32) + random_hex = random_bytes.hex() + return f"sk_live_{random_hex}" + + +def hash_api_key(key: str) -> str: + """Hash an API key using SHA-256.""" + return hashlib.sha256(key.encode()).hexdigest() + + +def verify_api_key(key: str, key_hash: str) -> bool: + """Verify an API key against its hash.""" + return hash_api_key(key) == key_hash + + +def get_api_key_prefix(key: str) -> str: + """Get the prefix of an API key for display purposes.""" + return key[:12] + "..." 
if len(key) > 12 else key + + +async def get_user_from_api_key( + api_key: Optional[str] = Header(None, alias="X-API-Key"), + db: Session = Depends(get_db) +) -> Optional[User]: + """ + Authenticate user from API key header. + Returns User if valid, None if no API key provided. + Raises HTTPException if API key is invalid. + """ + if not api_key: + return None + + # Hash the provided key + key_hash = hash_api_key(api_key) + + # Find the API key in database + api_key_record = ( + db.query(APIKey) + .filter(APIKey.key_hash == key_hash) + .filter(APIKey.is_active == True) + .first() + ) + + if not api_key_record: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid API key", + headers={"WWW-Authenticate": "Bearer"}, + ) + + # Update last used timestamp + api_key_record.last_used_at = datetime.utcnow() + db.commit() + + # Get the user + user = db.query(User).filter(User.id == api_key_record.user_id).first() + if not user: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="User not found for API key", + ) + + return user + + +async def get_current_user_or_api_key( + api_key_user: Optional[User] = Depends(get_user_from_api_key), + # JWT auth will be handled separately in main.py +) -> Optional[User]: + """ + Dependency that returns user from API key if provided, otherwise None. + This allows endpoints to support both JWT and API key authentication. + """ + return api_key_user + diff --git a/backend/app/apollo_service.py b/backend/app/apollo_service.py new file mode 100644 index 0000000000000000000000000000000000000000..000a263d66178cf1c034dbd4236d81f82595045d --- /dev/null +++ b/backend/app/apollo_service.py @@ -0,0 +1,444 @@ +""" +Apollo.io API service for creating contacts, enriching contact data, and adding them to sequences. 
+Reference: +- Create contact: https://docs.apollo.io/reference/create-a-contact +- Add to sequence: https://docs.apollo.io/reference/add-contacts-to-sequence +- Enrich person: https://docs.apollo.io/reference/enrich-people-data +""" +import os +import httpx +from typing import Optional, Dict, Any + +APOLLO_API_KEY = os.environ.get("APOLLO_API_KEY", "") +APOLLO_API_URL = "https://api.apollo.io/api/v1" +APOLLO_TRIAL_LIST_NAME = "VPR TRIAL LEADS" +# Allow list ID to be set directly via environment variable (more reliable than lookup) +APOLLO_TRIAL_LIST_ID = os.environ.get("APOLLO_TRIAL_LIST_ID", None) +# Sequence ID for adding contacts to email sequences (preferred over lists) +APOLLO_TRIAL_SEQUENCE_ID = os.environ.get("APOLLO_TRIAL_SEQUENCE_ID", None) + + +async def get_list_id(list_name: Optional[str] = None) -> Optional[str]: + """ + Get Apollo list ID. First tries environment variable, then attempts API lookup. + + Args: + list_name: Name of the list (for lookup if env var not set) + + Returns: + List ID as string if found, None otherwise + """ + # First, try to use the list ID from environment variable (most reliable) + if APOLLO_TRIAL_LIST_ID: + # Apollo list IDs are typically hexadecimal strings (MongoDB ObjectIds) + # Accept them as strings, just strip whitespace + list_id = str(APOLLO_TRIAL_LIST_ID).strip() + if list_id: + print(f"[INFO] Using Apollo list ID from environment variable: {list_id}") + return list_id + else: + print(f"[WARNING] APOLLO_TRIAL_LIST_ID is empty") + + # If no env var, try to look up by name (this may not work if API endpoint is different) + if not list_name or not APOLLO_API_KEY: + return None + + # Note: The /lists endpoint may not be available in all Apollo API versions + # Try alternative: search for lists using a different endpoint + try: + async with httpx.AsyncClient() as client: + # Try the lists endpoint (may return 404 in some API versions) + response = await client.get( + f"{APOLLO_API_URL}/lists", + headers={ + 
"Content-Type": "application/json", + "Cache-Control": "no-cache", + "X-Api-Key": APOLLO_API_KEY + }, + timeout=10.0 + ) + if response.status_code == 200: + data = response.json() + lists = data.get("lists", []) + for list_item in lists: + if list_item.get("name") == list_name: + list_id = list_item.get("id") + print(f"[INFO] Found Apollo list '{list_name}' with ID: {list_id}") + # Return as string (Apollo IDs are typically hex strings) + return str(list_id) if list_id else None + print(f"[WARNING] Apollo list '{list_name}' not found in available lists") + else: + print(f"[WARNING] Apollo lists endpoint returned {response.status_code}, cannot lookup list by name") + except Exception as e: + print(f"[WARNING] Failed to fetch Apollo list ID: {str(e)}") + + return None + + +async def add_contact_to_sequence(contact_id: str, sequence_id: str) -> bool: + """ + Add a contact to an Apollo.io email sequence. + + Args: + contact_id: The Apollo contact ID + sequence_id: The Apollo sequence ID + + Returns: + True if contact was successfully added to sequence, False otherwise + """ + if not APOLLO_API_KEY: + print("[WARNING] APOLLO_API_KEY not set, skipping sequence enrollment") + return False + + try: + async with httpx.AsyncClient() as client: + response = await client.post( + f"{APOLLO_API_URL}/sequence_contacts", + headers={ + "Content-Type": "application/json", + "Cache-Control": "no-cache", + "X-Api-Key": APOLLO_API_KEY + }, + json={ + "sequence_id": sequence_id, + "contact_id": contact_id + }, + timeout=10.0 + ) + + if response.status_code in [200, 201]: + print(f"[INFO] Successfully added contact {contact_id} to sequence {sequence_id}") + return True + else: + error_data = response.text + print(f"[ERROR] Failed to add contact to sequence: {response.status_code} - {error_data}") + return False + except httpx.HTTPStatusError as e: + print(f"[ERROR] Apollo API HTTP error adding to sequence: {e.response.status_code} - {e.response.text}") + return False + except Exception 
as e: + print(f"[ERROR] Failed to add contact to sequence: {str(e)}") + return False + + +async def create_apollo_contact( + email: str, + first_name: Optional[str] = None, + last_name: Optional[str] = None, + organization_name: Optional[str] = None, + title: Optional[str] = None, + list_name: Optional[str] = None, + sequence_id: Optional[str] = None +) -> bool: + """ + Create a contact in Apollo.io and optionally add to a sequence or list. + + Args: + email: Contact email address (required) + first_name: Contact first name + last_name: Contact last name + organization_name: Organization name + title: Job title + list_name: Name of the list to add contact to (defaults to APOLLO_TRIAL_LIST_NAME) + sequence_id: ID of the sequence to add contact to (preferred over list) + + Returns: + True if contact created successfully, False otherwise + + Raises: + ValueError: If APOLLO_API_KEY is not set + """ + if not APOLLO_API_KEY: + print("[WARNING] APOLLO_API_KEY not set, skipping Apollo contact creation") + return False + + # Use default list name if not provided + if list_name is None: + list_name = APOLLO_TRIAL_LIST_NAME + + # Parse name if full name is provided but first/last are not + if not first_name and not last_name: + # Try to extract from email or use email prefix + email_prefix = email.split('@')[0] + if '.' 
in email_prefix: + parts = email_prefix.split('.') + first_name = parts[0].capitalize() if parts else None + last_name = parts[1].capitalize() if len(parts) > 1 else None + else: + first_name = email_prefix.capitalize() + + # Extract organization domain from email + organization_domain = None + if '@' in email: + organization_domain = email.split('@')[1] + + # Prepare contact data + contact_data: Dict[str, Any] = { + "email": email.lower(), + "run_dedupe": True # Prevent duplicate contacts + } + + if first_name: + contact_data["first_name"] = first_name + if last_name: + contact_data["last_name"] = last_name + if organization_name: + contact_data["organization_name"] = organization_name + if organization_domain: + contact_data["organization_domain"] = organization_domain + if title: + contact_data["title"] = title + + try: + async with httpx.AsyncClient() as client: + # Get the list ID if list_name is provided + list_ids = [] + target_list_id = None # Store for later use + if list_name: + list_id = await get_list_id(list_name) + if list_id: + target_list_id = list_id # Store for verification later + # Apollo API accepts list_ids as an array of strings (hex IDs) + list_ids = [str(list_id)] + contact_data["list_ids"] = list_ids + print(f"[INFO] Adding contact to list ID: {list_id}") + else: + print(f"[WARNING] Could not find list '{list_name}'. 
Set APOLLO_TRIAL_LIST_ID environment variable with the list ID, or create contact without list assignment") + + # Log the payload being sent (for debugging) + print(f"[DEBUG] Creating Apollo contact with payload: {contact_data}") + + # Create the contact + response = await client.post( + f"{APOLLO_API_URL}/contacts", + headers={ + "Content-Type": "application/json", + "Cache-Control": "no-cache", + "X-Api-Key": APOLLO_API_KEY + }, + json=contact_data, + timeout=10.0 + ) + + # Log the full response for debugging + print(f"[DEBUG] Apollo API response status: {response.status_code}") + try: + response_json = response.json() + print(f"[DEBUG] Apollo API response (full): {response_json}") + except: + print(f"[DEBUG] Apollo API response body (text): {response.text[:1000]}") # First 1000 chars + + if response.status_code == 200 or response.status_code == 201: + result = response.json() + contact = result.get("contact", {}) + contact_id = contact.get("id") + print(f"[INFO] Successfully created Apollo contact: {email} (ID: {contact_id})") + + # Priority: Add to sequence if sequence_id is provided (this is supported by API) + target_sequence_id = sequence_id or APOLLO_TRIAL_SEQUENCE_ID + if contact_id and target_sequence_id: + print(f"[INFO] Adding contact to sequence: {target_sequence_id}") + sequence_success = await add_contact_to_sequence(contact_id, target_sequence_id) + if sequence_success: + print(f"[INFO] ✓ Contact successfully enrolled in sequence") + else: + print(f"[WARNING] Failed to add contact to sequence, but contact was created") + + # Fallback: Try to add to list (API limitation - may not work) + if list_ids and contact_id and target_list_id and not target_sequence_id: + print(f"[INFO] Contact created with list_ids parameter: {list_ids}") + print(f"[INFO] ⚠️ Apollo.io API Limitation: The API does not return list_ids in responses,") + print(f"[INFO] so we cannot verify if the contact was added to the list via API.") + print(f"[INFO] Please verify manually in 
Apollo.io that contact '{email}' is in list '{list_name or target_list_id}'") + print(f"[INFO] Consider using sequences instead (APOLLO_TRIAL_SEQUENCE_ID) for better API support.") + + return True + else: + error_data = response.text + print(f"[ERROR] Failed to create Apollo contact: {response.status_code} - {error_data}") + return False + + except httpx.HTTPStatusError as e: + print(f"[ERROR] Apollo API HTTP error: {e.response.status_code} - {e.response.text}") + return False + except Exception as e: + print(f"[ERROR] Failed to create Apollo contact: {str(e)}") + return False + + +async def enrich_contact_by_email(email: str) -> Optional[Dict[str, Any]]: + """ + Enrich contact data from Apollo.io using email address. + + Args: + email: Contact email address + + Returns: + Dictionary with enriched contact data, or None if not found + """ + if not APOLLO_API_KEY: + print("[WARNING] APOLLO_API_KEY not set, skipping Apollo enrichment") + return None + + try: + async with httpx.AsyncClient() as client: + # Try people/match endpoint first (for exact email match) + print(f"[DEBUG] Attempting Apollo.io enrichment for {email} via /people/match endpoint") + response = await client.post( + f"{APOLLO_API_URL}/people/match", + headers={ + "Content-Type": "application/json", + "Cache-Control": "no-cache", + "X-Api-Key": APOLLO_API_KEY + }, + json={ + "email": email.lower() + # Note: reveal_phone_number requires webhook_url, so we skip it for now + }, + timeout=10.0 + ) + + print(f"[DEBUG] Apollo.io /people/match response status: {response.status_code}") + + if response.status_code == 200: + data = response.json() + print(f"[DEBUG] Apollo.io /people/match response data keys: {list(data.keys())}") + person = data.get("person", {}) + if person: + print(f"[DEBUG] Found person data in Apollo.io response") + # Extract enriched data + enriched_data = { + "first_name": person.get("first_name"), + "last_name": person.get("last_name"), + "title": person.get("title"), + "phone_number": 
person.get("phone_numbers", [{}])[0].get("raw_number") if person.get("phone_numbers") else None, + "linkedin_url": person.get("linkedin_url"), + "headline": person.get("headline"), + "organization_name": person.get("organization", {}).get("name") if person.get("organization") else None, + "organization_website": person.get("organization", {}).get("website_url") if person.get("organization") else None, + "organization_address": None, # May need to parse from organization data + } + + # Try to get organization address + if person.get("organization"): + org = person.get("organization", {}) + address_parts = [] + if org.get("street_address"): + address_parts.append(org.get("street_address")) + if org.get("city"): + address_parts.append(org.get("city")) + if org.get("state"): + address_parts.append(org.get("state")) + if org.get("postal_code"): + address_parts.append(org.get("postal_code")) + if org.get("country"): + address_parts.append(org.get("country")) + if address_parts: + enriched_data["organization_address"] = ", ".join(address_parts) + + print(f"[INFO] Successfully enriched contact data for {email} from Apollo.io") + return enriched_data + else: + print(f"[DEBUG] Apollo.io /people/match returned 200 but no person data found") + elif response.status_code == 404: + print(f"[DEBUG] Apollo.io /people/match returned 404 - contact not found in database") + elif response.status_code == 401: + print(f"[ERROR] Apollo.io API authentication failed - check your API key") + try: + error_data = response.json() + print(f"[ERROR] Apollo.io error details: {error_data}") + except: + print(f"[ERROR] Apollo.io error response: {response.text}") + else: + print(f"[DEBUG] Apollo.io /people/match returned status {response.status_code}") + try: + error_data = response.json() + print(f"[DEBUG] Apollo.io response: {error_data}") + except: + print(f"[DEBUG] Apollo.io response text: {response.text[:500]}") + + # If match fails, try the new search endpoint (api_search) + print(f"[DEBUG] 
Attempting Apollo.io enrichment for {email} via /mixed_people/api_search endpoint") + search_response = await client.post( + f"{APOLLO_API_URL}/mixed_people/api_search", + headers={ + "Content-Type": "application/json", + "Cache-Control": "no-cache", + "X-Api-Key": APOLLO_API_KEY + }, + json={ + "email": email.lower(), + "per_page": 1 + }, + timeout=10.0 + ) + + print(f"[DEBUG] Apollo.io /mixed_people/api_search response status: {search_response.status_code}") + + if search_response.status_code == 200: + search_data = search_response.json() + print(f"[DEBUG] Apollo.io /mixed_people/api_search response data keys: {list(search_data.keys())}") + people = search_data.get("people", []) + print(f"[DEBUG] Found {len(people)} people in search results") + if people: + person = people[0] + # Extract enriched data (same structure as above) + enriched_data = { + "first_name": person.get("first_name"), + "last_name": person.get("last_name"), + "title": person.get("title"), + "phone_number": person.get("phone_numbers", [{}])[0].get("raw_number") if person.get("phone_numbers") else None, + "linkedin_url": person.get("linkedin_url"), + "headline": person.get("headline"), + "organization_name": person.get("organization", {}).get("name") if person.get("organization") else None, + "organization_website": person.get("organization", {}).get("website_url") if person.get("organization") else None, + "organization_address": None, + } + + if person.get("organization"): + org = person.get("organization", {}) + address_parts = [] + if org.get("street_address"): + address_parts.append(org.get("street_address")) + if org.get("city"): + address_parts.append(org.get("city")) + if org.get("state"): + address_parts.append(org.get("state")) + if org.get("postal_code"): + address_parts.append(org.get("postal_code")) + if org.get("country"): + address_parts.append(org.get("country")) + if address_parts: + enriched_data["organization_address"] = ", ".join(address_parts) + + print(f"[INFO] Successfully 
enriched contact data for {email} from Apollo.io (via search)") + return enriched_data + else: + print(f"[DEBUG] Apollo.io /mixed_people/api_search returned 200 but no people in results") + elif search_response.status_code == 404: + print(f"[DEBUG] Apollo.io /mixed_people/api_search returned 404 - contact not found") + elif search_response.status_code == 401: + print(f"[ERROR] Apollo.io API authentication failed on search - check your API key") + try: + error_data = search_response.json() + print(f"[ERROR] Apollo.io search error details: {error_data}") + except: + print(f"[ERROR] Apollo.io search error response: {search_response.text}") + else: + print(f"[DEBUG] Apollo.io /mixed_people/api_search returned status {search_response.status_code}") + try: + error_data = search_response.json() + print(f"[DEBUG] Apollo.io search response: {error_data}") + except: + print(f"[DEBUG] Apollo.io search response text: {search_response.text[:500]}") + + print(f"[INFO] No contact data found in Apollo.io for {email} - contact may not exist in Apollo's database") + return None + + except httpx.HTTPStatusError as e: + print(f"[ERROR] Apollo API HTTP error during enrichment: {e.response.status_code} - {e.response.text}") + return None + except Exception as e: + print(f"[ERROR] Failed to enrich contact from Apollo.io: {str(e)}") + return None + diff --git a/backend/app/auth.py b/backend/app/auth.py new file mode 100644 index 0000000000000000000000000000000000000000..23bc391cd29d72705a230ad96759426143858d36 --- /dev/null +++ b/backend/app/auth.py @@ -0,0 +1,92 @@ +import os +import jwt +from datetime import datetime, timedelta +from typing import Optional +from fastapi import Depends, HTTPException, status +from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials +from sqlalchemy.orm import Session +from .db import SessionLocal +from .models import User + +# JWT Configuration +SECRET_KEY = os.environ.get("JWT_SECRET_KEY", "your-secret-key-change-in-production") +ALGORITHM 
= "HS256" +ACCESS_TOKEN_EXPIRE_MINUTES = 60 * 24 * 7 # 7 days + +security = HTTPBearer() + + +def get_db(): + """Database dependency.""" + db = SessionLocal() + try: + yield db + finally: + db.close() + + +def create_access_token(data: dict, expires_delta: Optional[timedelta] = None): + """Create a JWT access token.""" + to_encode = data.copy() + # Ensure 'sub' (subject) is a string, not an integer + if "sub" in to_encode: + to_encode["sub"] = str(to_encode["sub"]) + if expires_delta: + expire = datetime.utcnow() + expires_delta + else: + expire = datetime.utcnow() + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES) + to_encode.update({"exp": expire}) + encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM) + return encoded_jwt + + +def verify_token(token: str) -> dict: + """Verify and decode a JWT token.""" + try: + payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM]) + return payload + except jwt.ExpiredSignatureError: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Token has expired", + ) + except jwt.InvalidTokenError: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Could not validate credentials", + ) + + +def get_current_user( + credentials: HTTPAuthorizationCredentials = Depends(security), + db: Session = Depends(get_db) +) -> User: + """Get the current authenticated user from JWT token.""" + token = credentials.credentials + payload = verify_token(token) + user_id: int = payload.get("sub") + + if user_id is None: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Could not validate credentials", + ) + + # Convert user_id back to integer for database query + try: + user_id_int = int(user_id) + except (ValueError, TypeError): + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid user ID in token", + ) + + user = db.query(User).filter(User.id == user_id_int).first() + if user is None: + raise HTTPException( + 
status_code=status.HTTP_401_UNAUTHORIZED, + detail="User not found", + ) + + return user + diff --git a/backend/app/auth_routes.py b/backend/app/auth_routes.py new file mode 100644 index 0000000000000000000000000000000000000000..f886754af32ffd1b17be05395642c72288103034 --- /dev/null +++ b/backend/app/auth_routes.py @@ -0,0 +1,347 @@ +import os +from fastapi import APIRouter, Depends, HTTPException, Body +from pydantic import BaseModel, EmailStr +from sqlalchemy.orm import Session +from .models import User, APIKey +from .auth import create_access_token, get_current_user +from .firebase_auth import verify_firebase_token +from .otp_service import request_otp, verify_otp +from .email_validator import validate_business_email, is_business_email +from .api_key_auth import generate_api_key, hash_api_key, get_api_key_prefix +from .db import SessionLocal + +def get_db(): + """Database dependency.""" + db = SessionLocal() + try: + yield db + finally: + db.close() + +router = APIRouter() + + +class FirebaseLoginRequest(BaseModel): + id_token: str + + +class OTPRequestRequest(BaseModel): + email: EmailStr + + +class OTPVerifyRequest(BaseModel): + email: EmailStr + otp: str + + +class CreateAPIKeyRequest(BaseModel): + name: str # User-friendly name for the API key + + +@router.post("/api/auth/firebase/login") +async def firebase_login( + request: FirebaseLoginRequest, + db: Session = Depends(get_db) +): + """ + Login with Firebase ID token. + Validates business email and creates/updates user. + """ + try: + # Verify Firebase token + user_info = await verify_firebase_token(request.id_token) + email = user_info.get('email') + + if not email: + raise HTTPException(status_code=400, detail="Email not found in Firebase token") + + # Validate business email + if not is_business_email(email): + raise HTTPException( + status_code=400, + detail="Only business email addresses are allowed. Personal email accounts (Gmail, Yahoo, Outlook, etc.) are not permitted. 
Please use your work email address." + ) + + # Get or create user + user = db.query(User).filter( + (User.email == email.lower()) | (User.firebase_uid == user_info['uid']) + ).first() + + if not user: + user = User( + email=email.lower(), + name=user_info.get('name'), + picture=user_info.get('picture'), + firebase_uid=user_info['uid'], + auth_method='firebase', + email_verified=True + ) + db.add(user) + db.commit() + db.refresh(user) + print(f"[INFO] New user created via Firebase: {email}") + + # Enrich contact data from Apollo.io and update Brevo + Monday.com + try: + from .apollo_service import enrich_contact_by_email + from .brevo_service import create_brevo_contact, BREVO_TRIAL_LIST_ID + from .monday_service import create_monday_lead + + # Enrich contact data from Apollo.io + enriched_data = await enrich_contact_by_email(email) + + # Use enriched data if available, otherwise use basic data + first_name = enriched_data.get("first_name") if enriched_data else None + last_name = enriched_data.get("last_name") if enriched_data else None + org_name = enriched_data.get("organization_name") if enriched_data else None + + # Fallback to Firebase data if Apollo didn't provide it + if not first_name or not last_name: + full_name = user_info.get('name', '') + if full_name: + name_parts = full_name.strip().split(' ', 1) + first_name = first_name or (name_parts[0] if name_parts else None) + last_name = last_name or (name_parts[1] if len(name_parts) > 1 else None) + + if not org_name: + org_domain = email.split('@')[1] if '@' in email else None + org_name = org_domain.split('.')[0].capitalize() if org_domain else None + + # Update Brevo contact with enriched data + await create_brevo_contact( + email=email, + first_name=first_name, + last_name=last_name, + organization_name=org_name or (enriched_data.get("organization_name") if enriched_data else None), + phone_number=enriched_data.get("phone_number") if enriched_data else None, + 
linkedin_url=enriched_data.get("linkedin_url") if enriched_data else None, + title=enriched_data.get("title") if enriched_data else None, + headline=enriched_data.get("headline") if enriched_data else None, + organization_website=enriched_data.get("organization_website") if enriched_data else None, + organization_address=enriched_data.get("organization_address") if enriched_data else None, + list_id=BREVO_TRIAL_LIST_ID + ) + + # Create lead in Monday.com + await create_monday_lead( + email=email, + first_name=first_name, + last_name=last_name, + phone_number=enriched_data.get("phone_number") if enriched_data else None, + linkedin_url=enriched_data.get("linkedin_url") if enriched_data else None, + title=enriched_data.get("title") if enriched_data else None, + headline=enriched_data.get("headline") if enriched_data else None, + organization_name=org_name or (enriched_data.get("organization_name") if enriched_data else None), + organization_website=enriched_data.get("organization_website") if enriched_data else None, + organization_address=enriched_data.get("organization_address") if enriched_data else None, + ) + except Exception as e: + # Don't fail user creation if integrations fail + print(f"[WARNING] Failed to enrich/update contact for {email}: {str(e)}") + else: + # Update user info + user.firebase_uid = user_info['uid'] + user.email_verified = True + user.name = user_info.get('name', user.name) + user.picture = user_info.get('picture', user.picture) + if user.auth_method != 'firebase': + user.auth_method = 'firebase' + db.commit() + print(f"[INFO] User logged in via Firebase: {email}") + + # Generate JWT token + token = create_access_token(data={"sub": user.id}) + + return { + "token": token, + "user": { + "id": user.id, + "email": user.email, + "name": user.name, + "picture": user.picture, + "auth_method": user.auth_method + } + } + except HTTPException: + raise + except Exception as e: + print(f"[ERROR] Firebase login failed: {str(e)}") + raise 
HTTPException(status_code=400, detail=f"Authentication failed: {str(e)}") + + +@router.post("/api/auth/otp/request") +async def request_otp_endpoint( + request: OTPRequestRequest, + db: Session = Depends(get_db) +): + """ + Request OTP for email login. + Validates business email before sending OTP. + """ + try: + # Validate business email + validate_business_email(request.email) + + # Request OTP + result = await request_otp(request.email, db) + return result + except HTTPException: + raise + except Exception as e: + print(f"[ERROR] OTP request failed: {str(e)}") + raise HTTPException(status_code=500, detail=f"Failed to send OTP: {str(e)}") + + +@router.post("/api/auth/otp/verify") +async def verify_otp_endpoint( + request: OTPVerifyRequest, + db: Session = Depends(get_db) +): + """ + Verify OTP and login. + Validates business email and OTP code. + """ + try: + # Validate business email + validate_business_email(request.email) + + # Verify OTP + user = await verify_otp(request.email, request.otp, db) + + # Generate JWT token + token = create_access_token(data={"sub": user.id}) + + return { + "token": token, + "user": { + "id": user.id, + "email": user.email, + "name": user.name, + "picture": user.picture, + "auth_method": user.auth_method + } + } + except HTTPException: + raise + except Exception as e: + print(f"[ERROR] OTP verification failed: {str(e)}") + raise HTTPException(status_code=400, detail=f"OTP verification failed: {str(e)}") + + +@router.get("/api/auth/me") +async def get_current_user_info(current_user: User = Depends(get_current_user)): + """Get current user information.""" + return { + "id": current_user.id, + "email": current_user.email, + "name": current_user.name, + "picture": current_user.picture, + "auth_method": current_user.auth_method, + } + + +@router.post("/api/auth/api-key/create") +async def create_api_key( + request: CreateAPIKeyRequest, + current_user: User = Depends(get_current_user), + db: Session = Depends(get_db) +): + """ + Create 
a new API key for the current user. + Returns the API key (only shown once - store it securely!). + """ + if not request.name or not request.name.strip(): + raise HTTPException(status_code=400, detail="API key name is required") + + # Generate new API key + api_key = generate_api_key() + key_hash = hash_api_key(api_key) + key_prefix = get_api_key_prefix(api_key) + + # Create API key record + api_key_record = APIKey( + user_id=current_user.id, + name=request.name.strip(), + key_hash=key_hash, + key_prefix=key_prefix, + is_active=True + ) + db.add(api_key_record) + db.commit() + db.refresh(api_key_record) + + print(f"[INFO] API key created for user {current_user.email}: {key_prefix}") + + return { + "success": True, + "api_key": api_key, # Only returned once - user must save this! + "key_id": api_key_record.id, + "key_prefix": key_prefix, + "name": api_key_record.name, + "created_at": api_key_record.created_at.isoformat() if api_key_record.created_at else None, + "message": "API key created successfully. Store this key securely - it will not be shown again!" + } + + +@router.get("/api/auth/api-keys") +async def list_api_keys( + current_user: User = Depends(get_current_user), + db: Session = Depends(get_db) +): + """ + List all API keys for the current user. + Only shows key prefix, not the full key for security. 
+ """ + api_keys = ( + db.query(APIKey) + .filter(APIKey.user_id == current_user.id) + .order_by(APIKey.created_at.desc()) + .all() + ) + + return { + "success": True, + "api_keys": [ + { + "id": key.id, + "name": key.name, + "key_prefix": key.key_prefix, + "is_active": key.is_active, + "last_used_at": key.last_used_at.isoformat() if key.last_used_at else None, + "created_at": key.created_at.isoformat() if key.created_at else None, + } + for key in api_keys + ] + } + + +@router.delete("/api/auth/api-key/{key_id}") +async def delete_api_key( + key_id: int, + current_user: User = Depends(get_current_user), + db: Session = Depends(get_db) +): + """ + Delete (deactivate) an API key. + """ + api_key = ( + db.query(APIKey) + .filter(APIKey.id == key_id) + .filter(APIKey.user_id == current_user.id) + .first() + ) + + if not api_key: + raise HTTPException(status_code=404, detail="API key not found") + + # Soft delete by deactivating + api_key.is_active = False + db.commit() + + print(f"[INFO] API key {api_key.key_prefix} deactivated for user {current_user.email}") + + return { + "success": True, + "message": "API key deactivated successfully" + } + diff --git a/backend/app/brevo_service.py b/backend/app/brevo_service.py new file mode 100644 index 0000000000000000000000000000000000000000..e8c36ce0c0ef54ad303de7e6625ff965e7a3b531 --- /dev/null +++ b/backend/app/brevo_service.py @@ -0,0 +1,486 @@ +""" +Brevo (formerly Sendinblue) email service for sending transactional emails. 
+Reference: https://developers.brevo.com/reference/sendtransacemail +""" +import os +import httpx +from typing import Optional, Dict, Any +from difflib import SequenceMatcher + +BREVO_API_KEY = os.environ.get("BREVO_API_KEY", "") +BREVO_API_URL = "https://api.brevo.com/v3/smtp/email" +BREVO_SENDER_EMAIL = os.environ.get("BREVO_SENDER_EMAIL", "noreply@yourdomain.com") +BREVO_SENDER_NAME = os.environ.get("BREVO_SENDER_NAME", "EZOFIS AI") +BREVO_TRIAL_LIST_ID = int(os.environ.get("BREVO_TRIAL_LIST_ID", "5")) # Default to 5 for "VRP Trials" + +# Brevo standard attribute names mapping +BREVO_ATTRIBUTE_MAP = { + "first_name": "FIRSTNAME", + "last_name": "LASTNAME", + "organization_name": "COMPANY", + "phone_number": "SMS", + "linkedin_url": "LINKEDIN", + "title": "JOB_TITLE", + "headline": "HEADLINE", + "organization_website": "WEBSITE", + "organization_address": "ADDRESS", + # Common variations + "firstname": "FIRSTNAME", + "fname": "FIRSTNAME", + "given_name": "FIRSTNAME", + "lastname": "LASTNAME", + "lname": "LASTNAME", + "surname": "LASTNAME", + "family_name": "LASTNAME", + "company": "COMPANY", + "org": "COMPANY", + "organization": "COMPANY", + "phone": "SMS", + "mobile": "SMS", + "telephone": "SMS", + "linkedin": "LINKEDIN", + "linkedin_profile": "LINKEDIN", + "job_title": "JOB_TITLE", + "position": "JOB_TITLE", + "role": "JOB_TITLE", + "website": "WEBSITE", + "url": "WEBSITE", + "web": "WEBSITE", + "address": "ADDRESS", + "location": "ADDRESS", +} + + +def _get_brevo_attribute_name(field_name: str) -> Optional[str]: + """ + Get Brevo attribute name for a given field name using semantic matching. 
+ + Args: + field_name: Field name (e.g., "first_name", "email") + + Returns: + Brevo attribute name (e.g., "FIRSTNAME") or None if not found + """ + # Normalize field name + normalized = field_name.lower().replace("_", "").replace("-", "") + + # Direct lookup first + if field_name.lower() in BREVO_ATTRIBUTE_MAP: + return BREVO_ATTRIBUTE_MAP[field_name.lower()] + + if normalized in BREVO_ATTRIBUTE_MAP: + return BREVO_ATTRIBUTE_MAP[normalized] + + # Semantic matching using similarity + best_match = None + best_score = 0.0 + + for key, value in BREVO_ATTRIBUTE_MAP.items(): + score = SequenceMatcher(None, normalized, key.lower()).ratio() + if score > best_score: + best_score = score + best_match = value + + # Only return if similarity is high enough + if best_score >= 0.6: + return best_match + + return None + + +async def send_otp_email(email: str, otp: str) -> bool: + """ + Send OTP email using Brevo transactional email API. + + Args: + email: Recipient email address + otp: One-time password code + + Returns: + True if email sent successfully + + Raises: + ValueError: If BREVO_API_KEY is not set + Exception: If email sending fails + """ + if not BREVO_API_KEY: + raise ValueError("BREVO_API_KEY environment variable is not set") + + # Brevo API payload structure + payload = { + "sender": { + "name": BREVO_SENDER_NAME, + "email": BREVO_SENDER_EMAIL + }, + "to": [ + { + "email": email + } + ], + "subject": "Your OTP Code for EZOFIS AI", + "htmlContent": f""" + + + + + + + + +
+
+

Hello,

+

You requested a one-time password (OTP) to sign in to your EZOFIS account.

+
+
Your OTP code is:
+
{otp}
+
+

This code will expire in 10 minutes.

+
+ ⚠️ Security Notice: If you didn't request this code, please ignore this email. Do not share this code with anyone. +
+ +
+
+ + + """, + "textContent": f""" +Your OTP Code for EZOFIS AI + +Hello, + +You requested a one-time password (OTP) to sign in to your EZOFIS account. + +Your OTP code is: {otp} + +This code will expire in 10 minutes. + +⚠️ Security Notice: If you didn't request this code, please ignore this email. Do not share this code with anyone. + +© EZOFIS - Agentic Intelligence Platform +This is an automated message, please do not reply. + """ + } + + headers = { + "accept": "application/json", + "api-key": BREVO_API_KEY, + "content-type": "application/json" + } + + try: + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.post(BREVO_API_URL, json=payload, headers=headers) + response.raise_for_status() + + result = response.json() + message_id = result.get('messageId', 'N/A') + print(f"[INFO] Brevo email sent successfully to {email}. Message ID: {message_id}") + return True + except httpx.HTTPStatusError as e: + error_detail = {} + try: + error_detail = e.response.json() if e.response else {} + except: + error_detail = {"message": str(e)} + + error_msg = error_detail.get('message', f'HTTP {e.response.status_code}' if e.response else 'Unknown error') + print(f"[ERROR] Brevo API error: {e.response.status_code if e.response else 'N/A'} - {error_msg}") + raise Exception(f"Failed to send email via Brevo: {error_msg}") + except httpx.TimeoutException: + print(f"[ERROR] Brevo API request timed out") + raise Exception("Email service timeout. Please try again.") + except Exception as e: + print(f"[ERROR] Brevo email sending failed: {str(e)}") + raise Exception(f"Failed to send email: {str(e)}") + + +async def send_share_email(recipient_email: str, sender_email: str, share_link: str, sender_name: str = None) -> bool: + """ + Send share email using Brevo transactional email API. 
+ + Args: + recipient_email: Recipient email address + sender_email: Sender email address + share_link: Share link URL + sender_name: Sender's display name (optional, falls back to email if not provided) + + Returns: + True if email sent successfully + + Raises: + ValueError: If BREVO_API_KEY is not set + Exception: If email sending fails + """ + if not BREVO_API_KEY: + raise ValueError("BREVO_API_KEY environment variable is not set") + + # Get base URL from environment or use default + base_url = os.environ.get("VITE_API_BASE_URL", "https://seth0330-ezofisocr.hf.space") + + # Determine sender display name: use sender_name if available, otherwise extract from email + # This is the logged-in user's name, NOT the email sender name (BREVO_SENDER_NAME) + # BREVO_SENDER_NAME is only used for the "From" field, not the email body + if sender_name and sender_name.strip(): + # Use the actual logged-in user's name + sender_display = sender_name.strip() + print(f"[INFO] Using user's name from database: {sender_display}") + else: + # Extract name from email (part before @) and format it nicely + email_name = sender_email.split('@')[0] + # Handle cases like "seth.smith" -> "Seth Smith" or "seth_smith" -> "Seth Smith" + if '.' 
in email_name: + parts = email_name.split('.') + sender_display = ' '.join(part.capitalize() for part in parts) + elif '_' in email_name: + parts = email_name.split('_') + sender_display = ' '.join(part.capitalize() for part in parts) + else: + # Simple case: "seth" -> "Seth" + sender_display = email_name.capitalize() + print(f"[INFO] Extracted name from email: {sender_display} (from {sender_email})") + + # Brevo API payload structure + # Note: BREVO_SENDER_NAME is used only for the "From" field in the email header + # The email body uses sender_display (the logged-in user's name) + payload = { + "sender": { + "name": BREVO_SENDER_NAME, + "email": BREVO_SENDER_EMAIL + }, + "to": [ + { + "email": recipient_email + } + ], + "subject": f"{sender_display} shared a document extraction with you", + "htmlContent": f""" + + + + + + + + +
+
+

Hello,

+

{sender_display} wants you to take a look at a document extraction output.

+ +

You'll need to sign in to your EZOFIS account to view this extraction. If you don't have an account, you can create one using the link above.

+ +
+
+ + + """, + "textContent": f""" +{sender_display} shared a document extraction with you + +Hello, + +{sender_display} wants you to take a look at a document extraction output. + +View the shared extraction: {share_link} + +You'll need to sign in to your EZOFIS account to view this extraction. If you don't have an account, you can create one using the link above. + +© EZOFIS - Agentic Intelligence Platform +This is an automated message, please do not reply. + """ + } + + headers = { + "accept": "application/json", + "api-key": BREVO_API_KEY, + "content-type": "application/json" + } + + try: + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.post(BREVO_API_URL, json=payload, headers=headers) + response.raise_for_status() + + result = response.json() + message_id = result.get('messageId', 'N/A') + print(f"[INFO] Brevo share email sent successfully to {recipient_email}. Message ID: {message_id}") + return True + except httpx.HTTPStatusError as e: + error_detail = {} + try: + error_detail = e.response.json() if e.response else {} + except: + error_detail = {"message": str(e)} + + error_msg = error_detail.get('message', f'HTTP {e.response.status_code}' if e.response else 'Unknown error') + print(f"[ERROR] Brevo API error: {e.response.status_code if e.response else 'N/A'} - {error_msg}") + raise Exception(f"Failed to send email via Brevo: {error_msg}") + except httpx.TimeoutException: + print(f"[ERROR] Brevo API request timed out") + raise Exception("Email service timeout. 
Please try again.") + except Exception as e: + print(f"[ERROR] Brevo email sending failed: {str(e)}") + raise Exception(f"Failed to send email: {str(e)}") + + +async def create_brevo_contact( + email: str, + first_name: Optional[str] = None, + last_name: Optional[str] = None, + organization_name: Optional[str] = None, + phone_number: Optional[str] = None, + linkedin_url: Optional[str] = None, + title: Optional[str] = None, + headline: Optional[str] = None, + organization_website: Optional[str] = None, + organization_address: Optional[str] = None, + list_id: Optional[int] = None +) -> bool: + """ + Create a contact in Brevo and optionally add to a list. + + Args: + email: Contact email address (required) + first_name: Contact first name + last_name: Contact last name + organization_name: Organization name + phone_number: Phone number + linkedin_url: LinkedIn profile URL + title: Job title + headline: Professional headline + organization_website: Company website + organization_address: Company address + list_id: ID of the list to add contact to (e.g., 5 for "VRP Trials") + + Returns: + True if contact created successfully, False otherwise + """ + if not BREVO_API_KEY: + print("[WARNING] BREVO_API_KEY not set, skipping Brevo contact creation") + return False + + # Prepare contact attributes using automatic field mapping + attributes = {} + + # Map all fields automatically + field_mappings = { + "first_name": first_name, + "last_name": last_name, + "organization_name": organization_name, + "phone_number": phone_number, + "linkedin_url": linkedin_url, + "title": title, + "headline": headline, + "organization_website": organization_website, + "organization_address": organization_address, + } + + for field_name, field_value in field_mappings.items(): + if field_value: + brevo_attr = _get_brevo_attribute_name(field_name) + if brevo_attr: + attributes[brevo_attr] = str(field_value).strip() # Ensure it's a string and trimmed + print(f"[DEBUG] Mapped '{field_name}' 
({field_value}) to Brevo attribute '{brevo_attr}'") + else: + print(f"[DEBUG] No Brevo attribute mapping found for '{field_name}'") + else: + print(f"[DEBUG] Skipping '{field_name}' - value is empty/None") + + print(f"[DEBUG] Final Brevo attributes to send: {attributes}") + + # Prepare contact data + contact_data = { + "email": email.lower(), + "updateEnabled": True # Update existing contact if email already exists + } + + if attributes: + contact_data["attributes"] = attributes + + # Add to list if list_id is provided + if list_id: + contact_data["listIds"] = [list_id] + + headers = { + "accept": "application/json", + "api-key": BREVO_API_KEY, + "content-type": "application/json" + } + + try: + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.post( + "https://api.brevo.com/v3/contacts", + json=contact_data, + headers=headers + ) + + if response.status_code in [200, 201, 204]: + print(f"[INFO] Successfully created Brevo contact: {email}" + + (f" and added to list {list_id}" if list_id else "")) + return True + elif response.status_code == 400: + # Contact might already exist, try to update it + try: + error_data = response.json() + if "already exists" in str(error_data).lower(): + print(f"[INFO] Contact {email} already exists in Brevo, updating...") + # Use PUT to update existing contact + update_response = await client.put( + f"https://api.brevo.com/v3/contacts/{email.lower()}", + json=contact_data, + headers=headers + ) + if update_response.status_code in [200, 204]: + print(f"[INFO] Successfully updated Brevo contact: {email}" + + (f" and added to list {list_id}" if list_id else "")) + return True + except: + pass + + error_data = response.text + print(f"[ERROR] Failed to create Brevo contact: {response.status_code} - {error_data}") + return False + else: + error_data = response.text + print(f"[ERROR] Failed to create Brevo contact: {response.status_code} - {error_data}") + return False + + except httpx.HTTPStatusError as e: + 
print(f"[ERROR] Brevo API HTTP error: {e.response.status_code} - {e.response.text}") + return False + except Exception as e: + print(f"[ERROR] Failed to create Brevo contact: {str(e)}") + return False + diff --git a/backend/app/db.py b/backend/app/db.py index a72b2f2467a1c70b8eb6cd97826f12f0b6f9daf7..b0badc86d624e866d673d7b92aa0121f22c6e25c 100644 --- a/backend/app/db.py +++ b/backend/app/db.py @@ -1,18 +1,18 @@ -import os -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker, declarative_base - -# SQLite DB path. In Docker / HF we’ll set DB_PATH env, default is local "data/app.db" -DB_PATH = os.environ.get("DB_PATH", "data/app.db") - -# Create SQLAlchemy engine -engine = create_engine( - f"sqlite:///{DB_PATH}", - connect_args={"check_same_thread": False}, -) - -# Session factory -SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) - -# Base model class -Base = declarative_base() +import os +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker, declarative_base + +# SQLite DB path. In Docker / HF we’ll set DB_PATH env, default is local "data/app.db" +DB_PATH = os.environ.get("DB_PATH", "data/app.db") + +# Create SQLAlchemy engine +engine = create_engine( + f"sqlite:///{DB_PATH}", + connect_args={"check_same_thread": False}, +) + +# Session factory +SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) + +# Base model class +Base = declarative_base() diff --git a/backend/app/email_validator.py b/backend/app/email_validator.py new file mode 100644 index 0000000000000000000000000000000000000000..4d659674ebbb8178400e2d3d27639a2e6b6fd797 --- /dev/null +++ b/backend/app/email_validator.py @@ -0,0 +1,61 @@ +""" +Email validation utilities to ensure only business emails are allowed. 
+""" +from fastapi import HTTPException + +# List of personal email domains to block +PERSONAL_EMAIL_DOMAINS = { + 'gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', + 'aol.com', 'icloud.com', 'mail.com', 'protonmail.com', + 'yandex.com', 'zoho.com', 'gmx.com', 'live.com', 'msn.com', + 'me.com', 'mac.com', 'yahoo.co.uk', 'yahoo.co.jp', 'yahoo.fr', + 'yahoo.de', 'yahoo.it', 'yahoo.es', 'yahoo.in', 'yahoo.com.au', + 'gmail.co.uk', 'gmail.fr', 'gmail.de', 'gmail.it', 'gmail.es', + 'gmail.in', 'gmail.com.au', 'hotmail.co.uk', 'hotmail.fr', + 'hotmail.de', 'hotmail.it', 'hotmail.es', 'outlook.co.uk', + 'outlook.fr', 'outlook.de', 'outlook.it', 'outlook.es', + 'rediffmail.com', 'sina.com', 'qq.com', '163.com', '126.com', + 'mail.ru', 'inbox.com', 'fastmail.com', 'tutanota.com', + 'hey.com', 'pm.me' +} + + +def is_business_email(email: str) -> bool: + """ + Check if email is a business email (not personal). + + Args: + email: Email address to validate + + Returns: + True if business email, False if personal email + """ + if not email or '@' not in email: + return False + + domain = email.split('@')[1].lower().strip() + return domain not in PERSONAL_EMAIL_DOMAINS + + +def validate_business_email(email: str) -> None: + """ + Raise exception if email is not a business email. + + Args: + email: Email address to validate + + Raises: + HTTPException: If email is a personal email domain + """ + if not email: + raise HTTPException( + status_code=400, + detail="Email address is required" + ) + + if not is_business_email(email): + raise HTTPException( + status_code=400, + detail="Only business email addresses are allowed. Personal email accounts (Gmail, Yahoo, Outlook, etc.) are not permitted. Please use your work email address." 
+ ) + diff --git a/backend/app/firebase_auth.py b/backend/app/firebase_auth.py new file mode 100644 index 0000000000000000000000000000000000000000..3779577473e565fbf5deff622cc05d5ed2bdf351 --- /dev/null +++ b/backend/app/firebase_auth.py @@ -0,0 +1,92 @@ +""" +Firebase Authentication utilities. +""" +import os +import json +import firebase_admin +from firebase_admin import auth, credentials +from fastapi import HTTPException + +# Initialize Firebase Admin SDK +_firebase_initialized = False + +def initialize_firebase(): + """Initialize Firebase Admin SDK.""" + global _firebase_initialized + + if _firebase_initialized: + return + + if not firebase_admin._apps: + # Try to get service account from environment variable (JSON string) + service_account_json = os.environ.get("FIREBASE_SERVICE_ACCOUNT_JSON") + + if service_account_json: + try: + service_account_info = json.loads(service_account_json) + cred = credentials.Certificate(service_account_info) + firebase_admin.initialize_app(cred) + _firebase_initialized = True + print("[INFO] Firebase Admin SDK initialized from environment variable") + return + except json.JSONDecodeError: + print("[WARNING] Failed to parse FIREBASE_SERVICE_ACCOUNT_JSON") + + # Try to get service account from file path + service_account_path = os.environ.get("FIREBASE_SERVICE_ACCOUNT_KEY") + if service_account_path and os.path.exists(service_account_path): + cred = credentials.Certificate(service_account_path) + firebase_admin.initialize_app(cred) + _firebase_initialized = True + print(f"[INFO] Firebase Admin SDK initialized from file: {service_account_path}") + return + + # Try to use default credentials (for Google Cloud environments) + try: + firebase_admin.initialize_app() + _firebase_initialized = True + print("[INFO] Firebase Admin SDK initialized with default credentials") + return + except Exception as e: + print(f"[WARNING] Firebase initialization failed: {e}") + raise HTTPException( + status_code=500, + detail="Firebase not configured. 
Please set FIREBASE_SERVICE_ACCOUNT_JSON or FIREBASE_SERVICE_ACCOUNT_KEY environment variable." + ) + + +async def verify_firebase_token(id_token: str) -> dict: + """ + Verify Firebase ID token and return user info. + + Args: + id_token: Firebase ID token from client + + Returns: + Dictionary with user information (uid, email, name, picture) + + Raises: + HTTPException: If token is invalid + """ + initialize_firebase() + + try: + decoded_token = auth.verify_id_token(id_token) + + return { + 'uid': decoded_token['uid'], + 'email': decoded_token.get('email'), + 'name': decoded_token.get('name'), + 'picture': decoded_token.get('picture'), + } + except ValueError as e: + raise HTTPException( + status_code=401, + detail=f"Invalid Firebase token: {str(e)}" + ) + except Exception as e: + raise HTTPException( + status_code=401, + detail=f"Firebase authentication failed: {str(e)}" + ) + diff --git a/backend/app/main.py b/backend/app/main.py index 7d245081d133305eb8fbc0d178fcc2a343fcd639..eaf9af424ce32a34d1959735c211387bcc92b1a5 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -1,292 +1,786 @@ -import os -import time -from typing import List, Dict - -from fastapi import FastAPI, UploadFile, File, Depends -from fastapi.middleware.cors import CORSMiddleware -from fastapi.staticfiles import StaticFiles -from sqlalchemy.orm import Session - -from .db import Base, engine, SessionLocal -from .models import ExtractionRecord -from .schemas import ExtractionRecordBase, ExtractionStage -from .openrouter_client import extract_fields_from_document - -# Ensure data dir exists for SQLite -os.makedirs("data", exist_ok=True) - -# Create tables -Base.metadata.create_all(bind=engine) - -app = FastAPI(title="Document Capture Demo – Backend") - -# CORS (for safety we allow all; you can tighten later) -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - - -def get_db(): - db = SessionLocal() - try: 
- yield db - finally: - db.close() - - -@app.get("/ping") -def ping(): - """Healthcheck.""" - return {"status": "ok", "message": "backend alive"} - - -def make_stages(total_ms: int, status: str) -> Dict[str, ExtractionStage]: - """ - Build synthetic stage timing data for the History UI. - For now we just split total_ms into 4 stages. - """ - if total_ms <= 0: - total_ms = 1000 - - return { - "uploading": ExtractionStage( - time=int(total_ms * 0.15), - status="completed", - variation="normal", - ), - "aiAnalysis": ExtractionStage( - time=int(total_ms * 0.55), - status="completed" if status == "completed" else "failed", - variation="normal", - ), - "dataExtraction": ExtractionStage( - time=int(total_ms * 0.2), - status="completed" if status == "completed" else "skipped", - variation="fast", - ), - "outputRendering": ExtractionStage( - time=int(total_ms * 0.1), - status="completed" if status == "completed" else "skipped", - variation="normal", - ), - } - - -@app.post("/api/extract") -async def extract_document( - file: UploadFile = File(...), - db: Session = Depends(get_db), -): - """ - Main extraction endpoint used by the Dashboard. - 1) Read the uploaded file - 2) Call OpenRouter + Qwen3-VL - 3) Store a record in SQLite - 4) Return extraction result + metadata - """ - start = time.time() - content = await file.read() - content_type = file.content_type or "application/octet-stream" - size_mb = len(content) / 1024 / 1024 - size_str = f"{size_mb:.2f} MB" - - try: - print(f"[INFO] Starting extraction for file: {file.filename}, type: {content_type}, size: {size_str}") - extracted = await extract_fields_from_document(content, content_type, file.filename) - total_ms = int((time.time() - start) * 1000) - - print(f"[INFO] Extraction completed. 
Response keys: {list(extracted.keys())}") - print(f"[INFO] Fields extracted: {extracted.get('fields', {})}") - - confidence = float(extracted.get("confidence", 90)) - fields = extracted.get("fields", {}) - - # Get full_text for text output - full_text = extracted.get("full_text", "") - if full_text: - full_text_words = len(str(full_text).split()) - print(f"[INFO] Full text extracted: {full_text_words} words") - - # Check if fields contain structured data (from table parsing) - # If fields is a dict with page_X keys, it's already structured - # If fields is empty or simple, add full_text and pages for text display - if not fields or (isinstance(fields, dict) and not any(k.startswith("page_") for k in fields.keys())): - if full_text: - fields["full_text"] = full_text - - # Also check for pages array - pages_data = extracted.get("pages", []) - if pages_data and isinstance(pages_data, list): - print(f"[INFO] Extracted text from {len(pages_data)} page(s)") - fields["pages"] = pages_data - - # Count fields - if structured data exists, count table rows + metadata - if isinstance(fields, dict): - # Check if it's structured page data - if any(k.startswith("page_") for k in fields.keys()): - # Count structured fields (metadata keys + table rows) - page_data = list(fields.values())[0] if len(fields) == 1 else fields - if isinstance(page_data, dict): - table_rows = page_data.get("table", []) - metadata_keys = len(page_data.get("metadata", {})) - fields_extracted = len(table_rows) + metadata_keys - print(f"[INFO] Structured data: {len(table_rows)} table rows, {metadata_keys} metadata fields") - else: - fields_extracted = len(fields) - else: - # Regular fields count (excluding full_text and pages) - fields_extracted = len([k for k in fields.keys() if k not in ["full_text", "pages"]]) - else: - fields_extracted = 0 - - print(f"[INFO] Final stats - confidence: {confidence}, fields_count: {fields_extracted}") - - status = "completed" - error_message = None - except Exception as e: 
- import traceback - total_ms = int((time.time() - start) * 1000) - confidence = 0.0 - fields = {} - fields_extracted = 0 - status = "failed" - error_message = str(e) - print(f"[ERROR] Extraction failed: {error_message}") - print(f"[ERROR] Traceback: {traceback.format_exc()}") - - # Save record to DB - rec = ExtractionRecord( - file_name=file.filename, - file_type=content_type, - file_size=size_str, - status=status, - confidence=confidence, - fields_extracted=fields_extracted, - total_time_ms=total_ms, - raw_output=str(fields), - error_message=error_message, - ) - db.add(rec) - db.commit() - db.refresh(rec) - - stages = make_stages(total_ms, status) - - # Response shape that frontend will consume - return { - "id": rec.id, - "fileName": rec.file_name, - "fileType": rec.file_type, - "fileSize": rec.file_size, - "status": status, - "confidence": confidence, - "fieldsExtracted": fields_extracted, - "totalTime": total_ms, - "fields": fields, - "stages": {k: v.dict() for k, v in stages.items()}, - "errorMessage": error_message, - } - - -@app.get("/api/history", response_model=List[ExtractionRecordBase]) -def get_history(db: Session = Depends(get_db)): - """ - Used by the History page. - Returns last 100 records, with synthetic stage data. 
- """ - recs = ( - db.query(ExtractionRecord) - .order_by(ExtractionRecord.created_at.desc()) - .limit(100) - .all() - ) - - output: List[ExtractionRecordBase] = [] - for r in recs: - stages = make_stages(r.total_time_ms or 1000, r.status or "completed") - output.append( - ExtractionRecordBase( - id=r.id, - fileName=r.file_name, - fileType=r.file_type or "", - fileSize=r.file_size or "", - extractedAt=r.created_at, - status=r.status or "completed", - confidence=r.confidence or 0.0, - fieldsExtracted=r.fields_extracted or 0, - totalTime=r.total_time_ms or 0, - stages=stages, - errorMessage=r.error_message, - ) - ) - return output - - -# Static frontend mounting (used after we build React) -# Dockerfile copies the Vite build into backend/frontend_dist -# IMPORTANT: API routes must be defined BEFORE this so they take precedence -frontend_dir = os.path.join( - os.path.dirname(os.path.dirname(__file__)), "frontend_dist" -) - -if os.path.isdir(frontend_dir): - # Serve static files (JS, CSS, images, etc.) from assets directory - assets_dir = os.path.join(frontend_dir, "assets") - if os.path.isdir(assets_dir): - app.mount( - "/assets", - StaticFiles(directory=assets_dir), - name="assets", - ) - - # Serve static files from root (logo.png, favicon.ico, etc.) 
- # Files in public/ directory are copied to dist/ root during Vite build - # These routes must be defined BEFORE the catch-all route - @app.get("/logo.png") - async def serve_logo(): - """Serve logo.png from frontend_dist root.""" - from fastapi.responses import FileResponse - logo_path = os.path.join(frontend_dir, "logo.png") - if os.path.exists(logo_path): - return FileResponse(logo_path, media_type="image/png") - from fastapi import HTTPException - raise HTTPException(status_code=404) - - @app.get("/favicon.ico") - async def serve_favicon(): - """Serve favicon.ico from frontend_dist root.""" - from fastapi.responses import FileResponse - favicon_path = os.path.join(frontend_dir, "favicon.ico") - if os.path.exists(favicon_path): - return FileResponse(favicon_path, media_type="image/x-icon") - from fastapi import HTTPException - raise HTTPException(status_code=404) - - # Catch-all route to serve index.html for React Router - # This must be last so API routes and static files are matched first - @app.get("/{full_path:path}") - async def serve_frontend(full_path: str): - """ - Serve React app for all non-API routes. - React Router will handle client-side routing. 
- """ - # Skip API routes, docs, static assets, and known static files - if (full_path.startswith("api/") or - full_path.startswith("docs") or - full_path.startswith("openapi.json") or - full_path.startswith("assets/") or - full_path in ["logo.png", "favicon.ico"]): - from fastapi import HTTPException - raise HTTPException(status_code=404) - - # Serve index.html for all other routes (React Router will handle routing) - from fastapi.responses import FileResponse - index_path = os.path.join(frontend_dir, "index.html") - if os.path.exists(index_path): - return FileResponse(index_path) - from fastapi import HTTPException - raise HTTPException(status_code=404) +import os +import time +from typing import List, Dict, Optional + +from fastapi import FastAPI, UploadFile, File, Depends, Form, HTTPException, Body +from fastapi.middleware.cors import CORSMiddleware +from fastapi.staticfiles import StaticFiles +from sqlalchemy.orm import Session +from pydantic import BaseModel + +from .db import Base, engine, SessionLocal +from .models import ExtractionRecord, User, ShareToken +from .schemas import ExtractionRecordBase, ExtractionStage +from .openrouter_client import extract_fields_from_document +from .auth import get_current_user, get_db, verify_token +from .auth_routes import router as auth_router +from .api_key_auth import get_user_from_api_key +from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer +from typing import Optional + +# Allowed file types +ALLOWED_CONTENT_TYPES = [ + "application/pdf", + "image/png", + "image/jpeg", + "image/jpg", + "image/tiff", + "image/tif" +] + +# Allowed file extensions (for fallback validation) +ALLOWED_EXTENSIONS = [".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".tif"] + +# Maximum file size: 4 MB +MAX_FILE_SIZE = 4 * 1024 * 1024 # 4 MB in bytes + +# Ensure data dir exists for SQLite +os.makedirs("data", exist_ok=True) + +# Create tables +Base.metadata.create_all(bind=engine) + +app = FastAPI(title="Document Capture Demo – 
Backend") + +# Include auth routes +app.include_router(auth_router) + +# CORS (for safety we allow all; you can tighten later) +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +def get_db(): + db = SessionLocal() + try: + yield db + finally: + db.close() + + +async def get_current_user_or_api_key_user( + api_key_user: Optional[User] = Depends(get_user_from_api_key), + credentials: Optional[HTTPAuthorizationCredentials] = Depends(HTTPBearer(auto_error=False)), + db: Session = Depends(get_db), +) -> User: + """ + Flexible authentication: supports both JWT Bearer token and API key. + Tries API key first, then falls back to JWT if no API key provided. + """ + # If API key authentication succeeded, use that + if api_key_user: + return api_key_user + + # Otherwise, try JWT authentication + if credentials: + try: + from .auth import verify_token + token = credentials.credentials + payload = verify_token(token) + user_id = int(payload.get("sub")) + user = db.query(User).filter(User.id == user_id).first() + if user: + return user + except Exception: + pass # Will raise HTTPException below + + # If neither worked, raise authentication error + raise HTTPException( + status_code=401, + detail="Authentication required. Provide either a Bearer token or X-API-Key header.", + headers={"WWW-Authenticate": "Bearer"}, + ) + + +@app.get("/ping") +def ping(): + """Healthcheck.""" + return {"status": "ok", "message": "backend alive"} + + +def make_stages(total_ms: int, status: str) -> Dict[str, ExtractionStage]: + """ + Build synthetic stage timing data for the History UI. + For now we just split total_ms into 4 stages. 
+ """ + if total_ms <= 0: + total_ms = 1000 + + return { + "uploading": ExtractionStage( + time=int(total_ms * 0.15), + status="completed", + variation="normal", + ), + "aiAnalysis": ExtractionStage( + time=int(total_ms * 0.55), + status="completed" if status == "completed" else "failed", + variation="normal", + ), + "dataExtraction": ExtractionStage( + time=int(total_ms * 0.2), + status="completed" if status == "completed" else "skipped", + variation="fast", + ), + "outputRendering": ExtractionStage( + time=int(total_ms * 0.1), + status="completed" if status == "completed" else "skipped", + variation="normal", + ), + } + + +@app.post("/api/extract") +async def extract_document( + file: UploadFile = File(...), + key_fields: Optional[str] = Form(None), + db: Session = Depends(get_db), + current_user: User = Depends(get_current_user_or_api_key_user), +): + """ + Main extraction endpoint for document parsing. + Supports both JWT Bearer token and API key authentication. + + Authentication methods: + 1. JWT Bearer token: Header "Authorization: Bearer " + 2. API Key: Header "X-API-Key: " + + Parameters: + - file: Document file (PDF, PNG, JPEG, TIFF) - max 4MB + - key_fields: Optional comma-separated list of specific fields to extract (e.g., "Invoice Number,Invoice Date") + + Returns JSON with extracted fields, text, confidence, and metadata. + """ + start = time.time() + content = await file.read() + content_type = file.content_type or "application/octet-stream" + file_size = len(content) + size_mb = file_size / 1024 / 1024 + size_str = f"{size_mb:.2f} MB" + + # Convert file content to base64 for storage + import base64 + file_base64 = base64.b64encode(content).decode("utf-8") + + # Validate file size + if file_size > MAX_FILE_SIZE: + raise HTTPException( + status_code=400, + detail=f"File size exceeds 4 MB limit. Your file is {size_mb:.2f} MB." + ) + + # Validate file type + file_extension = "" + if file.filename: + file_extension = "." 
+ file.filename.split(".")[-1].lower() + + is_valid_type = ( + content_type in ALLOWED_CONTENT_TYPES or + file_extension in ALLOWED_EXTENSIONS + ) + + if not is_valid_type: + raise HTTPException( + status_code=400, + detail="Only PDF, PNG, JPG, and TIFF files are allowed." + ) + + try: + print(f"[INFO] Starting extraction for file: {file.filename}, type: {content_type}, size: {size_str}") + if key_fields: + print(f"[INFO] Key fields requested: {key_fields}") + extracted = await extract_fields_from_document(content, content_type, file.filename, key_fields) + total_ms = int((time.time() - start) * 1000) + + print(f"[INFO] Extraction completed. Response keys: {list(extracted.keys())}") + print(f"[INFO] Fields extracted: {extracted.get('fields', {})}") + + confidence = float(extracted.get("confidence", 90)) + fields = extracted.get("fields", {}) + + # Get Fields from root level (if user provided key_fields) + root_fields = extracted.get("Fields", {}) + + # Get full_text for text output + full_text = extracted.get("full_text", "") + if full_text: + full_text_words = len(str(full_text).split()) + print(f"[INFO] Full text extracted: {full_text_words} words") + + # Check if fields contain structured data (from table parsing) + # If fields is a dict with page_X keys, it's already structured + # If fields is empty or simple, add full_text and pages for text display + if not fields or (isinstance(fields, dict) and not any(k.startswith("page_") for k in fields.keys())): + if full_text: + fields["full_text"] = full_text + + # Also check for pages array + pages_data = extracted.get("pages", []) + if pages_data and isinstance(pages_data, list): + print(f"[INFO] Extracted text from {len(pages_data)} page(s)") + fields["pages"] = pages_data + + # Add Fields at root level if it exists + if root_fields: + fields["Fields"] = root_fields + + # Count fields - if structured data exists, count table rows + root Fields + if isinstance(fields, dict): + # Check if it's structured page data + 
if any(k.startswith("page_") for k in fields.keys()): + # Count table rows from all pages + table_rows_count = 0 + for page_key, page_data in fields.items(): + if page_key.startswith("page_") and isinstance(page_data, dict): + table_rows = page_data.get("table", []) + if isinstance(table_rows, list): + table_rows_count += len(table_rows) + + # Count Fields from root level + fields_keys = 0 + if isinstance(root_fields, dict): + fields_keys = len(root_fields) + + fields_extracted = table_rows_count + fields_keys + print(f"[INFO] Structured data: {table_rows_count} table rows, {fields_keys} extracted fields") + else: + # Regular fields count (excluding full_text, pages, and Fields) + fields_extracted = len([k for k in fields.keys() if k not in ["full_text", "pages", "Fields"]]) + # Add Fields count if it exists + if isinstance(root_fields, dict): + fields_extracted += len(root_fields) + else: + fields_extracted = 0 + + print(f"[INFO] Final stats - confidence: {confidence}, fields_count: {fields_extracted}") + + status = "completed" + error_message = None + except Exception as e: + import traceback + total_ms = int((time.time() - start) * 1000) + confidence = 0.0 + fields = {} + fields_extracted = 0 + status = "failed" + error_message = str(e) + print(f"[ERROR] Extraction failed: {error_message}") + print(f"[ERROR] Traceback: {traceback.format_exc()}") + + # Save record to DB + import json + import base64 + rec = ExtractionRecord( + user_id=current_user.id, + file_name=file.filename, + file_type=content_type, + file_size=size_str, + status=status, + confidence=confidence, + fields_extracted=fields_extracted, + total_time_ms=total_ms, + raw_output=json.dumps(fields), # Use JSON instead of str() to preserve structure + file_base64=file_base64, # Store base64 encoded file for preview + error_message=error_message, + ) + db.add(rec) + db.commit() + db.refresh(rec) + + stages = make_stages(total_ms, status) + + # Response shape that frontend will consume + return { + "id": 
rec.id, + "fileName": rec.file_name, + "fileType": rec.file_type, + "fileSize": rec.file_size, + "status": status, + "confidence": confidence, + "fieldsExtracted": fields_extracted, + "totalTime": total_ms, + "fields": fields, + "stages": {k: v.dict() for k, v in stages.items()}, + "errorMessage": error_message, + } + + +@app.get("/api/history", response_model=List[ExtractionRecordBase]) +def get_history( + db: Session = Depends(get_db), + current_user: User = Depends(get_current_user), +): + """ + Used by the History page. + Returns last 100 records for the current user, with synthetic stage data. + """ + recs = ( + db.query(ExtractionRecord) + .filter(ExtractionRecord.user_id == current_user.id) + .order_by(ExtractionRecord.created_at.desc()) + .limit(100) + .all() + ) + + # Deduplicate: if multiple extractions share the same shared_from_extraction_id, + # keep only the most recent one (to prevent duplicates when same extraction is shared multiple times) + seen_shared_ids = set() + deduplicated_recs = [] + for rec in recs: + if rec.shared_from_extraction_id: + # This is a shared extraction + if rec.shared_from_extraction_id not in seen_shared_ids: + seen_shared_ids.add(rec.shared_from_extraction_id) + deduplicated_recs.append(rec) + # Skip duplicates + else: + # Original extraction (not shared), always include + deduplicated_recs.append(rec) + + recs = deduplicated_recs + + output: List[ExtractionRecordBase] = [] + for r in recs: + stages = make_stages(r.total_time_ms or 1000, r.status or "completed") + output.append( + ExtractionRecordBase( + id=r.id, + fileName=r.file_name, + fileType=r.file_type or "", + fileSize=r.file_size or "", + extractedAt=r.created_at, + status=r.status or "completed", + confidence=r.confidence or 0.0, + fieldsExtracted=r.fields_extracted or 0, + totalTime=r.total_time_ms or 0, + stages=stages, + errorMessage=r.error_message, + ) + ) + return output + + +@app.get("/api/extraction/{extraction_id}") +def get_extraction( + extraction_id: 
int, + db: Session = Depends(get_db), + current_user: User = Depends(get_current_user), +): + """ + Get a specific extraction by ID with full fields data. + Used when viewing output from History page. + """ + import json + + rec = ( + db.query(ExtractionRecord) + .filter( + ExtractionRecord.id == extraction_id, + ExtractionRecord.user_id == current_user.id + ) + .first() + ) + + if not rec: + from fastapi import HTTPException + raise HTTPException(status_code=404, detail="Extraction not found") + + # Parse the raw_output JSON string back to dict + fields = {} + if rec.raw_output: + try: + # Try parsing as JSON first (new format) + fields = json.loads(rec.raw_output) + except (json.JSONDecodeError, TypeError): + # If that fails, try using ast.literal_eval for old str() format (backward compatibility) + try: + import ast + # Only use literal_eval if it looks like a Python dict string + if rec.raw_output.strip().startswith('{'): + fields = ast.literal_eval(rec.raw_output) + else: + fields = {} + except: + fields = {} + + stages = make_stages(rec.total_time_ms or 1000, rec.status or "completed") + + return { + "id": rec.id, + "fileName": rec.file_name, + "fileType": rec.file_type or "", + "fileSize": rec.file_size or "", + "status": rec.status or "completed", + "confidence": rec.confidence or 0.0, + "fieldsExtracted": rec.fields_extracted or 0, + "totalTime": rec.total_time_ms or 0, + "fields": fields, + "fileBase64": rec.file_base64, # Include base64 encoded file for preview + "stages": {k: v.dict() for k, v in stages.items()}, + "errorMessage": rec.error_message, + } + + +@app.post("/api/share") +async def share_extraction( + extraction_id: int = Body(...), + recipient_emails: List[str] = Body(...), + db: Session = Depends(get_db), + current_user: User = Depends(get_current_user), +): + """ + Share an extraction with one or more users via email. + Creates share tokens and sends emails to recipients. 
+ """ + import secrets + from datetime import datetime, timedelta + from .brevo_service import send_share_email + from .email_validator import validate_business_email + + # Validate recipient emails list + if not recipient_emails or len(recipient_emails) == 0: + raise HTTPException(status_code=400, detail="At least one recipient email is required") + + # Validate each recipient email is a business email + for email in recipient_emails: + try: + validate_business_email(email) + except HTTPException: + raise # Re-raise HTTPException from validate_business_email + + # Get the extraction record + extraction = ( + db.query(ExtractionRecord) + .filter( + ExtractionRecord.id == extraction_id, + ExtractionRecord.user_id == current_user.id + ) + .first() + ) + + if not extraction: + raise HTTPException(status_code=404, detail="Extraction not found") + + # Generate share link base URL + base_url = os.environ.get("VITE_API_BASE_URL", "https://seth0330-ezofisocr.hf.space") + + # Process each recipient email + successful_shares = [] + failed_shares = [] + share_records = [] + + for recipient_email in recipient_emails: + recipient_email = recipient_email.strip().lower() + + # Generate secure share token for this recipient + share_token = secrets.token_urlsafe(32) + + # Create share token record (expires in 30 days) + expires_at = datetime.utcnow() + timedelta(days=30) + share_record = ShareToken( + token=share_token, + extraction_id=extraction_id, + sender_user_id=current_user.id, + recipient_email=recipient_email, + expires_at=expires_at, + ) + db.add(share_record) + share_records.append((share_record, share_token, recipient_email)) + + # Commit all share tokens + try: + db.commit() + for share_record, share_token, recipient_email in share_records: + db.refresh(share_record) + except Exception as e: + db.rollback() + raise HTTPException(status_code=500, detail=f"Failed to create share tokens: {str(e)}") + + # Send emails to all recipients + for share_record, share_token, 
recipient_email in share_records: + share_link = f"{base_url}/share/{share_token}" + try: + # Get sender's name from current_user, fallback to None if not available + sender_name = current_user.name if current_user.name else None + await send_share_email(recipient_email, current_user.email, share_link, sender_name) + successful_shares.append(recipient_email) + except Exception as e: + # Log error but continue with other emails + print(f"[ERROR] Failed to send share email to {recipient_email}: {str(e)}") + failed_shares.append(recipient_email) + # Optionally, you could delete the share token if email fails + # db.delete(share_record) + + # Build response message + if len(failed_shares) == 0: + message = f"Extraction shared successfully with {len(successful_shares)} recipient(s)" + elif len(successful_shares) == 0: + raise HTTPException(status_code=500, detail=f"Failed to send share emails to all recipients") + else: + message = f"Extraction shared with {len(successful_shares)} recipient(s). Failed to send to: {', '.join(failed_shares)}" + + return { + "success": True, + "message": message, + "successful_count": len(successful_shares), + "failed_count": len(failed_shares), + "successful_emails": successful_shares, + "failed_emails": failed_shares if failed_shares else None + } + + +class ShareLinkRequest(BaseModel): + extraction_id: int + +@app.post("/api/share/link") +async def create_share_link( + request: ShareLinkRequest, + db: Session = Depends(get_db), + current_user: User = Depends(get_current_user), +): + """ + Create a shareable link for an extraction without requiring recipient emails. + Returns a share link that can be copied and shared manually. 
+ """ + import secrets + from datetime import datetime, timedelta + + # Get the extraction record + extraction = ( + db.query(ExtractionRecord) + .filter( + ExtractionRecord.id == request.extraction_id, + ExtractionRecord.user_id == current_user.id + ) + .first() + ) + + if not extraction: + raise HTTPException(status_code=404, detail="Extraction not found") + + # Generate secure share token + share_token = secrets.token_urlsafe(32) + + # Create share token record (expires in 30 days, no specific recipient) + expires_at = datetime.utcnow() + timedelta(days=30) + share_record = ShareToken( + token=share_token, + extraction_id=request.extraction_id, + sender_user_id=current_user.id, + recipient_email=None, # None for public share links (copyable links) + expires_at=expires_at, + ) + db.add(share_record) + db.commit() + db.refresh(share_record) + + # Generate share link + base_url = os.environ.get("VITE_API_BASE_URL", "https://seth0330-ezofisocr.hf.space") + share_link = f"{base_url}/share/{share_token}" + + return { + "success": True, + "share_link": share_link, + "share_token": share_token, + "expires_at": expires_at.isoformat() if expires_at else None + } + + +@app.get("/api/share/{token}") +async def access_shared_extraction( + token: str, + db: Session = Depends(get_db), + current_user: User = Depends(get_current_user), +): + """ + Access a shared extraction and copy it to the current user's account. + This endpoint is called after the user logs in via the share link. 
+ """ + from datetime import datetime + import json + + # Find the share token + share = ( + db.query(ShareToken) + .filter(ShareToken.token == token) + .first() + ) + + if not share: + raise HTTPException(status_code=404, detail="Share link not found or expired") + + # Check if token is expired + if share.expires_at and share.expires_at < datetime.utcnow(): + raise HTTPException(status_code=410, detail="Share link has expired") + + # Get the original extraction + original_extraction = ( + db.query(ExtractionRecord) + .filter(ExtractionRecord.id == share.extraction_id) + .first() + ) + + if not original_extraction: + raise HTTPException(status_code=404, detail="Original extraction not found") + + # Check if already copied for this user (check by share token to prevent duplicates from same share) + # Also check if this specific share token was already used by this user + if share.accessed and share.accessed_by_user_id == current_user.id: + # This share token was already used by this user, find the extraction + existing_copy = ( + db.query(ExtractionRecord) + .filter( + ExtractionRecord.user_id == current_user.id, + ExtractionRecord.shared_from_extraction_id == original_extraction.id + ) + .order_by(ExtractionRecord.created_at.desc()) + .first() + ) + + if existing_copy: + return { + "success": True, + "extraction_id": existing_copy.id, + "message": "Extraction already shared with you" + } + + # Also check if any copy exists for this user from this original extraction + existing_copy = ( + db.query(ExtractionRecord) + .filter( + ExtractionRecord.user_id == current_user.id, + ExtractionRecord.shared_from_extraction_id == original_extraction.id + ) + .first() + ) + + if existing_copy: + # Already copied, mark this share as accessed and return existing extraction ID + share.accessed = True + share.accessed_at = datetime.utcnow() + share.accessed_by_user_id = current_user.id + db.commit() + + return { + "success": True, + "extraction_id": existing_copy.id, + "message": 
"Extraction already shared with you" + } + + # Copy extraction to current user's account + # Parse the raw_output JSON string back to dict + fields = {} + if original_extraction.raw_output: + try: + fields = json.loads(original_extraction.raw_output) + except (json.JSONDecodeError, TypeError): + try: + import ast + if original_extraction.raw_output.strip().startswith('{'): + fields = ast.literal_eval(original_extraction.raw_output) + else: + fields = {} + except: + fields = {} + + # Create new extraction record for the recipient + new_extraction = ExtractionRecord( + user_id=current_user.id, + file_name=original_extraction.file_name, + file_type=original_extraction.file_type, + file_size=original_extraction.file_size, + status=original_extraction.status or "completed", + confidence=original_extraction.confidence or 0.0, + fields_extracted=original_extraction.fields_extracted or 0, + total_time_ms=original_extraction.total_time_ms or 0, + raw_output=original_extraction.raw_output, # Copy the JSON string + file_base64=original_extraction.file_base64, # Copy the base64 file + shared_from_extraction_id=original_extraction.id, + shared_by_user_id=share.sender_user_id, + ) + db.add(new_extraction) + + # Mark share as accessed + share.accessed = True + share.accessed_at = datetime.utcnow() + share.accessed_by_user_id = current_user.id + + db.commit() + db.refresh(new_extraction) + + return { + "success": True, + "extraction_id": new_extraction.id, + "message": "Extraction shared successfully" + } + + +# Static frontend mounting (used after we build React) +# Dockerfile copies the Vite build into backend/frontend_dist +# IMPORTANT: API routes must be defined BEFORE this so they take precedence +frontend_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "frontend_dist" +) + +if os.path.isdir(frontend_dir): + # Serve static files (JS, CSS, images, etc.) 
from assets directory + assets_dir = os.path.join(frontend_dir, "assets") + if os.path.isdir(assets_dir): + app.mount( + "/assets", + StaticFiles(directory=assets_dir), + name="assets", + ) + + # Serve static files from root (logo.png, favicon.ico, etc.) + # Files in public/ directory are copied to dist/ root during Vite build + # These routes must be defined BEFORE the catch-all route + @app.get("/logo.png") + async def serve_logo(): + """Serve logo.png from frontend_dist root.""" + from fastapi.responses import FileResponse + logo_path = os.path.join(frontend_dir, "logo.png") + if os.path.exists(logo_path): + return FileResponse(logo_path, media_type="image/png") + from fastapi import HTTPException + raise HTTPException(status_code=404) + + @app.get("/favicon.ico") + async def serve_favicon(): + """Serve favicon.ico from frontend_dist root.""" + from fastapi.responses import FileResponse + favicon_path = os.path.join(frontend_dir, "favicon.ico") + if os.path.exists(favicon_path): + return FileResponse(favicon_path, media_type="image/x-icon") + from fastapi import HTTPException + raise HTTPException(status_code=404) + + # Catch-all route to serve index.html for React Router + # This must be last so API routes and static files are matched first + @app.get("/{full_path:path}") + async def serve_frontend(full_path: str): + """ + Serve React app for all non-API routes. + React Router will handle client-side routing. 
+ """ + # Skip API routes, docs, static assets, and known static files + if (full_path.startswith("api/") or + full_path.startswith("docs") or + full_path.startswith("openapi.json") or + full_path.startswith("assets/") or + full_path in ["logo.png", "favicon.ico"]): + from fastapi import HTTPException + raise HTTPException(status_code=404) + + # Serve index.html for all other routes (React Router will handle routing) + from fastapi.responses import FileResponse + index_path = os.path.join(frontend_dir, "index.html") + if os.path.exists(index_path): + return FileResponse(index_path) + from fastapi import HTTPException + raise HTTPException(status_code=404) diff --git a/backend/app/models.py b/backend/app/models.py index 2045e3ab3a0286a24f6891ecfef949f07521d83d..b9ac2e139945211d9249dfb63ed643acec2e9fb5 100644 --- a/backend/app/models.py +++ b/backend/app/models.py @@ -1,32 +1,136 @@ -from sqlalchemy import Column, Integer, String, Float, DateTime, Text -from sqlalchemy.sql import func - -from .db import Base - - -class ExtractionRecord(Base): - """ - Stores one extraction run so the History page can show past jobs. - We’ll fill it from the /api/extract endpoint later. 
- """ - - __tablename__ = "extractions" - - id = Column(Integer, primary_key=True, index=True) - - file_name = Column(String, index=True) - file_type = Column(String) - file_size = Column(String) - - status = Column(String) # "completed" | "failed" - confidence = Column(Float) # overall confidence (0–100) - fields_extracted = Column(Integer) # number of fields extracted - total_time_ms = Column(Integer) # total processing time in ms - - raw_output = Column(Text) # JSON string from the model - error_message = Column(Text, nullable=True) - - created_at = Column( - DateTime(timezone=True), - server_default=func.now(), - ) +from sqlalchemy import Column, Integer, String, Float, DateTime, Text, ForeignKey, Boolean +from sqlalchemy.orm import relationship +from sqlalchemy.sql import func + +from .db import Base + + +class User(Base): + """ + Stores user information from Firebase or OTP authentication. + """ + __tablename__ = "users" + + id = Column(Integer, primary_key=True, index=True) + email = Column(String, unique=True, index=True, nullable=False) + name = Column(String, nullable=True) + picture = Column(String, nullable=True) + + # Auth method: 'firebase' or 'otp' + auth_method = Column(String, default='firebase') + + # Firebase-specific + firebase_uid = Column(String, unique=True, index=True, nullable=True) + + # OTP-specific + email_verified = Column(Boolean, default=False) + + created_at = Column( + DateTime(timezone=True), + server_default=func.now(), + ) + + # Relationship to extraction records (explicitly specify user_id as the foreign key) + # Note: primaryjoin must be specified because ExtractionRecord has multiple foreign keys to User + extractions = relationship( + "ExtractionRecord", + back_populates="user", + primaryjoin="User.id == ExtractionRecord.user_id" + ) + + # Relationship to API keys + api_keys = relationship( + "APIKey", + back_populates="user", + cascade="all, delete-orphan" + ) + + +class ExtractionRecord(Base): + """ + Stores one extraction 
run so the History page can show past jobs. + We'll fill it from the /api/extract endpoint later. + """ + + __tablename__ = "extractions" + + id = Column(Integer, primary_key=True, index=True) + user_id = Column(Integer, ForeignKey("users.id"), nullable=False, index=True) + + file_name = Column(String, index=True) + file_type = Column(String) + file_size = Column(String) + + status = Column(String) # "completed" | "failed" + confidence = Column(Float) # overall confidence (0–100) + fields_extracted = Column(Integer) # number of fields extracted + total_time_ms = Column(Integer) # total processing time in ms + + raw_output = Column(Text) # JSON string from the model + file_base64 = Column(Text, nullable=True) # Base64 encoded original file for preview + error_message = Column(Text, nullable=True) + + created_at = Column( + DateTime(timezone=True), + server_default=func.now(), + ) + + # Relationship to user (explicitly specify user_id as the foreign key) + # Note: primaryjoin must be specified because ExtractionRecord has multiple foreign keys to User + user = relationship( + "User", + back_populates="extractions", + primaryjoin="ExtractionRecord.user_id == User.id" + ) + + # Track if this extraction was shared (original extraction ID) + shared_from_extraction_id = Column(Integer, ForeignKey("extractions.id"), nullable=True, index=True) + shared_by_user_id = Column(Integer, ForeignKey("users.id"), nullable=True, index=True) + + +class ShareToken(Base): + """ + Stores share tokens for sharing extractions with other users. 
+ """ + __tablename__ = "share_tokens" + + id = Column(Integer, primary_key=True, index=True) + token = Column(String, unique=True, index=True, nullable=False) # Unique share token + extraction_id = Column(Integer, ForeignKey("extractions.id"), nullable=False, index=True) + sender_user_id = Column(Integer, ForeignKey("users.id"), nullable=False, index=True) + recipient_email = Column(String, nullable=True, index=True) # Nullable for public share links + expires_at = Column(DateTime(timezone=True), nullable=True) # Optional expiration + accessed = Column(Boolean, default=False) # Track if link was accessed + accessed_at = Column(DateTime(timezone=True), nullable=True) + accessed_by_user_id = Column(Integer, ForeignKey("users.id"), nullable=True) + + created_at = Column( + DateTime(timezone=True), + server_default=func.now(), + ) + + +class APIKey(Base): + """ + Stores API keys for external application authentication. + API keys are hashed before storage for security. + """ + __tablename__ = "api_keys" + + id = Column(Integer, primary_key=True, index=True) + user_id = Column(Integer, ForeignKey("users.id"), nullable=False, index=True) + name = Column(String, nullable=False) # User-friendly name for the API key + key_hash = Column(String, unique=True, index=True, nullable=False) # Hashed API key + key_prefix = Column(String, nullable=False) # First 8 chars of key for display (e.g., "sk_live_") + is_active = Column(Boolean, default=True, nullable=False) + last_used_at = Column(DateTime(timezone=True), nullable=True) + created_at = Column( + DateTime(timezone=True), + server_default=func.now(), + ) + + # Relationship to user + user = relationship( + "User", + back_populates="api_keys" + ) diff --git a/backend/app/monday_service.py b/backend/app/monday_service.py new file mode 100644 index 0000000000000000000000000000000000000000..73e674d46940c0548fa932d57d568fc8d276919e --- /dev/null +++ b/backend/app/monday_service.py @@ -0,0 +1,391 @@ +""" +Monday.com API service for 
"""
Monday.com API service for creating leads with automatic field matching.
Reference: https://developer.monday.com/api-reference/docs
"""
import os
import httpx
import json
from typing import Optional, Dict, Any, List, Tuple
from difflib import SequenceMatcher

MONDAY_API_KEY = os.environ.get("MONDAY_API_KEY", "")
MONDAY_API_URL = "https://api.monday.com/v2"
MONDAY_BOARD_ID = os.environ.get("MONDAY_BOARD_ID", None)  # Your "New Leads" board ID

# Cache of board-id -> column metadata so we only hit the API once per board.
_board_columns_cache: Dict[str, List[Dict[str, Any]]] = {}

# Hand-curated synonym lists consulted when matching an extracted field name
# against board column titles.
_FIELD_SYNONYMS: Dict[str, List[str]] = {
    "first_name": ["first name", "firstname", "fname", "given name"],
    "last_name": ["last name", "lastname", "lname", "surname", "family name"],
    "email": ["email address", "email", "e-mail", "mail"],
    "phone_number": ["phone", "phone number", "telephone", "mobile", "cell"],
    "linkedin_url": ["linkedin", "linkedin profile", "linkedin url", "linkedin link"],
    "title": ["job title", "position", "role", "job"],
    "headline": ["headline", "tagline", "bio"],
    "organization_name": ["company", "organization", "org", "company name", "employer"],
    "organization_website": ["website", "company website", "url", "web"],
    "organization_address": ["address", "company address", "location"],
}


def _calculate_similarity(str1: str, str2: str) -> float:
    """Case-insensitive similarity ratio between two strings, in [0.0, 1.0]."""
    return SequenceMatcher(None, str1.lower(), str2.lower()).ratio()


def _find_best_column_match(
    field_name: str,
    available_columns: List[Dict[str, Any]],
    min_similarity: float = 0.3
) -> Optional[Tuple[str, str, float]]:
    """
    Pick the board column whose title best matches a field name.

    Args:
        field_name: The field name to match (e.g., "first_name", "email").
        available_columns: List of column dicts with 'id' and 'title' keys.
        min_similarity: Minimum similarity threshold (0.0 to 1.0).

    Returns:
        Tuple of (column_id, column_title, similarity_score) for the best
        match, or None when no column clears the threshold.
    """
    # Variations of the field name we are willing to score against a title:
    # space-separated form, the raw lowercase name, an underscore-free form,
    # and any known synonyms.
    spaced = field_name.lower().replace("_", " ").replace("-", " ")
    variations = [spaced, field_name.lower(), field_name.replace("_", "")]
    variations.extend(_FIELD_SYNONYMS.get(field_name, []))

    winner: Optional[Tuple[str, str, float]] = None
    winning_score = 0.0

    for column in available_columns:
        column_id = column.get("id", "")
        column_title = column.get("title", "")
        if not column_title or not column_id:
            continue

        # Score the column by its best variation (comparison is
        # case-insensitive inside _calculate_similarity).
        score = max(_calculate_similarity(v, column_title) for v in variations)
        if score > winning_score:
            winning_score = score
            winner = (column_id, column_title, score)

    if winner and winning_score >= min_similarity:
        return winner
    return None
Fetch board columns from Monday.com API. + + Args: + board_id: Monday.com board ID + + Returns: + List of column dictionaries with 'id', 'title', and 'type' keys + """ + # Check cache first + if board_id in _board_columns_cache: + return _board_columns_cache[board_id] + + if not MONDAY_API_KEY: + print("[WARNING] MONDAY_API_KEY not set, cannot fetch board columns") + return [] + + query = """ + query ($boardId: ID!) { + boards(ids: [$boardId]) { + columns { + id + title + type + } + } + } + """ + + headers = { + "Authorization": MONDAY_API_KEY, + "Content-Type": "application/json" + } + + try: + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.post( + MONDAY_API_URL, + json={ + "query": query, + "variables": {"boardId": board_id} + }, + headers=headers + ) + + if response.status_code == 200: + result = response.json() + if result.get("data") and result["data"].get("boards"): + boards = result["data"]["boards"] + if boards and boards[0].get("columns"): + columns = boards[0]["columns"] + # Cache the result + _board_columns_cache[board_id] = columns + print(f"[INFO] Fetched {len(columns)} columns from Monday.com board {board_id}") + return columns + elif result.get("errors"): + print(f"[ERROR] Failed to fetch board columns: {result['errors']}") + else: + print(f"[ERROR] Failed to fetch board columns: {response.status_code} - {response.text}") + except Exception as e: + print(f"[ERROR] Exception while fetching board columns: {str(e)}") + + return [] + + +def _format_column_value(value: Any, column_type: str, column_id: Optional[str] = None) -> Any: + """ + Format a value according to Monday.com column type. + + Args: + value: The value to format + column_type: Monday.com column type (email, phone, link, text, etc.) 
+ column_id: Column ID (for special handling) + + Returns: + For email/phone/link: Python dict object + For text/other types: Plain string + """ + if value is None: + return "" + + value_str = str(value) + + if column_type == "email": + # Monday.com email format requires dict object (will be JSON encoded later) + return {"email": value_str, "text": value_str} + elif column_type == "phone": + return {"phone": value_str, "countryShortName": "US"} + elif column_type == "link": + # If it's already a URL, use it; otherwise create a link + if value_str.startswith("http://") or value_str.startswith("https://"): + return {"url": value_str, "text": value_str} + else: + return {"url": f"https://{value_str}", "text": value_str} + else: + # Text, status, and other types - just return the string + return value_str + + +async def create_monday_lead( + email: str, + first_name: Optional[str] = None, + last_name: Optional[str] = None, + phone_number: Optional[str] = None, + linkedin_url: Optional[str] = None, + title: Optional[str] = None, + headline: Optional[str] = None, + organization_name: Optional[str] = None, + organization_website: Optional[str] = None, + organization_address: Optional[str] = None, + board_id: Optional[str] = None +) -> bool: + """ + Create a new lead item in Monday.com board. 
async def create_monday_lead(
    email: str,
    first_name: Optional[str] = None,
    last_name: Optional[str] = None,
    phone_number: Optional[str] = None,
    linkedin_url: Optional[str] = None,
    title: Optional[str] = None,
    headline: Optional[str] = None,
    organization_name: Optional[str] = None,
    organization_website: Optional[str] = None,
    organization_address: Optional[str] = None,
    board_id: Optional[str] = None
) -> bool:
    """
    Create a new lead item in Monday.com board.

    Extracted contact fields are matched to board columns by title
    similarity (_find_best_column_match), formatted per column type
    (_format_column_value), and submitted via the GraphQL `create_item`
    mutation.

    Args:
        email: Contact email address (required)
        first_name: Contact first name
        last_name: Contact last name
        phone_number: Phone number
        linkedin_url: LinkedIn profile URL
        title: Job title
        headline: Professional headline
        organization_name: Company name
        organization_website: Company website
        organization_address: Company address
        board_id: Monday.com board ID as string (defaults to MONDAY_BOARD_ID env var)

    Returns:
        True if lead created successfully, False otherwise
    """
    if not MONDAY_API_KEY:
        print("[WARNING] MONDAY_API_KEY not set, skipping Monday.com lead creation")
        return False

    target_board_id = board_id or MONDAY_BOARD_ID
    if not target_board_id:
        print("[WARNING] MONDAY_BOARD_ID not set, skipping Monday.com lead creation")
        return False

    # Item name: prefer the person's full name, fall back to the email.
    item_name = email
    if first_name and last_name:
        item_name = f"{first_name} {last_name}"
    elif first_name:
        item_name = first_name
    elif last_name:
        item_name = last_name

    # Fetch board columns to automatically match fields
    print(f"[INFO] Fetching Monday.com board columns for automatic field matching...")
    board_columns = await _get_board_columns(str(target_board_id))

    if not board_columns:
        print("[WARNING] Could not fetch board columns, skipping Monday.com lead creation")
        return False

    # Column id -> Monday column type, used when formatting values.
    column_types = {col["id"]: col.get("type", "text") for col in board_columns}

    # The candidate fields to push; empty/None values are skipped below.
    data_fields = {
        "email": email,
        "first_name": first_name,
        "last_name": last_name,
        "phone_number": phone_number,
        "linkedin_url": linkedin_url,
        "title": title,
        "headline": headline,
        "organization_name": organization_name,
        "organization_website": organization_website,
        "organization_address": organization_address,
    }

    column_values = {}
    matched_fields = []
    # Track which columns have been claimed so duplicates can be handled
    # (e.g. first_name and last_name both matching a single "Name" column).
    column_matches = {}  # column_id -> (field_name, value)

    for field_name, field_value in data_fields.items():
        if not field_value:
            continue

        match = _find_best_column_match(field_name, board_columns)
        if not match:
            print(f"[DEBUG] No suitable column match found for '{field_name}' (skipping)")
            continue

        column_id, column_title, similarity = match
        column_type = column_types.get(column_id, "text")

        if column_id in column_matches:
            existing_field, existing_value = column_matches[column_id]
            if (field_name in ["first_name", "last_name"] and
                    existing_field in ["first_name", "last_name"] and
                    field_name != existing_field):
                # first_name and last_name mapped to the same column: combine
                # them in "first last" order regardless of arrival order.
                if field_name == "first_name":
                    combined_value = f"{field_value} {existing_value}"
                else:
                    combined_value = f"{existing_value} {field_value}"
                formatted_value = _format_column_value(combined_value, column_type, column_id)
                column_values[column_id] = formatted_value
                matched_fields.append(f"{existing_field}+{field_name} -> {column_title} (combined)")
                print(f"[INFO] Combined '{existing_field}' and '{field_name}' to column '{column_title}' (ID: {column_id})")
            else:
                # Column already claimed by a different field; first match wins.
                print(f"[DEBUG] Column '{column_title}' already matched to '{existing_field}', skipping '{field_name}'")
            continue

        formatted_value = _format_column_value(field_value, column_type, column_id)
        column_values[column_id] = formatted_value
        column_matches[column_id] = (field_name, field_value)
        matched_fields.append(f"{field_name} -> {column_title} (similarity: {similarity:.2f})")
        # BUG FIX: formatted_value may be a dict (email/phone/link columns);
        # slicing a dict raises. Stringify first, then truncate for the log.
        value_preview = str(formatted_value)
        if len(value_preview) > 100:
            value_preview = value_preview[:100]
        print(f"[INFO] Matched '{field_name}' to column '{column_title}' (ID: {column_id}, type: {column_type}, value: {value_preview})")

    if not column_values:
        print("[WARNING] No fields could be matched to board columns")
        return False

    print(f"[INFO] Successfully matched {len(matched_fields)} fields to Monday.com columns")

    # Monday.com expects column values as one JSON-encoded string; dict
    # payloads (email/phone/link) are serialized correctly by json.dumps.
    column_values_json = json.dumps(column_values)
    print(f"[DEBUG] Monday.com column_values JSON: {column_values_json[:500]}")

    # GraphQL mutation.
    # Note: Monday.com uses ID! (string) type for board_id, not Int!
    mutation = """
    mutation ($boardId: ID!, $itemName: String!, $columnValues: JSON!) {
        create_item (board_id: $boardId, item_name: $itemName, column_values: $columnValues) {
            id
        }
    }
    """

    # Convert board_id to string (Monday.com expects ID! which is a string)
    board_id_str = str(target_board_id)

    variables = {
        "boardId": board_id_str,
        "itemName": item_name,
        "columnValues": column_values_json
    }

    headers = {
        "Authorization": MONDAY_API_KEY,
        "Content-Type": "application/json"
    }

    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                MONDAY_API_URL,
                json={
                    "query": mutation,
                    "variables": variables
                },
                headers=headers
            )

            if response.status_code == 200:
                result = response.json()
                if result.get("data") and result["data"].get("create_item"):
                    item_id = result["data"]["create_item"].get("id")
                    print(f"[INFO] Successfully created Monday.com lead: {item_name} (ID: {item_id})")
                    return True
                elif result.get("errors"):
                    errors = result.get("errors", [])
                    for error in errors:
                        error_msg = error.get("message", "Unknown error")
                        error_path = error.get("path", [])
                        print(f"[ERROR] Monday.com API error: {error_msg}")
                        if error_path:
                            print(f"[ERROR] Error path: {error_path}")
                    # Log full error for debugging
                    print(f"[DEBUG] Full Monday.com error response: {json.dumps(errors, indent=2)}")
                    return False
                else:
                    print(f"[ERROR] Unexpected Monday.com API response: {result}")
                    return False
            else:
                error_data = response.text
                print(f"[ERROR] Failed to create Monday.com lead: {response.status_code} - {error_data}")
                return False

    except httpx.HTTPStatusError as e:
        print(f"[ERROR] Monday.com API HTTP error: {e.response.status_code} - {e.response.text}")
        return False
    except Exception as e:
        print(f"[ERROR] Failed to create Monday.com lead: {str(e)}")
        return False
Cannot convert PDF to images.") - - pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf") - images = [] - - print(f"[INFO] PDF has {len(pdf_doc)} page(s)") - - for page_num in range(len(pdf_doc)): - page = pdf_doc[page_num] - # Render page to image (zoom factor 2 for better quality) - mat = fitz.Matrix(2.0, 2.0) # 2x zoom for better quality - pix = page.get_pixmap(matrix=mat) - - # Convert to PIL Image then to JPEG bytes (better compression) - img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) - img_bytes = BytesIO() - img.save(img_bytes, format="JPEG", quality=95) - images.append(img_bytes.getvalue()) - - print(f"[INFO] Converted page {page_num + 1} to image ({pix.width}x{pix.height})") - - pdf_doc.close() - return images - - -def _image_bytes_to_base64(image_bytes: bytes) -> str: - """Convert image bytes to base64 data URL (JPEG format).""" - b64 = base64.b64encode(image_bytes).decode("utf-8") - data_url = f"data:image/jpeg;base64,{b64}" - print(f"[DEBUG] Base64 encoded image: {len(image_bytes)} bytes -> {len(data_url)} chars") - return data_url - - -def _parse_markdown_table(text: str) -> Optional[Tuple[List[str], List[List[str]]]]: - """ - Parse a markdown table from text. - Returns (headers, rows) if table found, None otherwise. - Handles various table formats including malformed ones. 
- """ - lines = [line.strip() for line in text.split('\n')] - - # Find potential table start (line with multiple | and actual text content) - table_start = None - for i, line in enumerate(lines): - if '|' in line and line.count('|') >= 2: - # Skip separator lines (only |, -, :, spaces) - if re.match(r'^[\s\|\-:]+$', line): - continue - # Check if line has meaningful text (not just | characters) - cells = [cell.strip() for cell in line.split('|')] - if cells and not cells[0]: - cells = cells[1:] - if cells and not cells[-1]: - cells = cells[:-1] - # Must have at least 2 columns with some text - meaningful_cells = [c for c in cells if len(c) > 0] - if len(meaningful_cells) >= 2: - table_start = i - break - - if table_start is None: - return None - - # Find table end (first non-empty line without | after table start) - table_end = None - for i in range(table_start + 1, len(lines)): - line = lines[i] - if not line: # Empty line, continue - continue - if '|' not in line: - # Non-empty line without | means table ended - table_end = i - break - - if table_end is None: - table_end = len(lines) - - table_lines = lines[table_start:table_end] - - # Find the actual header row (should have meaningful text, not just | or separators) - headers = None - header_idx = None - - for i, line in enumerate(table_lines): - if not line or '|' not in line: - continue - - # Skip separator lines (lines with only |, -, :, spaces) - if re.match(r'^[\s\|\-:]+$', line): - continue - - # Check if this line has meaningful content (not just | characters) - cells = [cell.strip() for cell in line.split('|')] - # Remove empty cells at start/end - if cells and not cells[0]: - cells = cells[1:] - if cells and not cells[-1]: - cells = cells[:-1] - - # Header should have at least 3 columns and meaningful text - if len(cells) >= 3: - # Check if cells have actual text (not just empty or single char) - meaningful_cells = [c for c in cells if len(c) > 1] - if len(meaningful_cells) >= 3: - headers = cells - 
header_idx = i - break - - if not headers or header_idx is None: - return None - - # Parse data rows (skip separator line after header if present) - rows = [] - num_columns = len(headers) - - for i in range(header_idx + 1, len(table_lines)): - line = table_lines[i] - - if not line: - continue - - # Skip separator lines - if re.match(r'^[\s\|\-:]+$', line): - continue - - if '|' not in line: - # No more table rows - break - - cells = [cell.strip() for cell in line.split('|')] - # Remove empty cells at start/end - if cells and not cells[0]: - cells = cells[1:] - if cells and not cells[-1]: - cells = cells[:-1] - - # Only add rows that match header column count (allow some flexibility) - if len(cells) == num_columns or (len(cells) >= num_columns - 1 and len(cells) <= num_columns + 1): - # Pad or trim to match header count - if len(cells) < num_columns: - cells.extend([''] * (num_columns - len(cells))) - elif len(cells) > num_columns: - cells = cells[:num_columns] - - # Only add if row has at least one non-empty cell - if any(cell for cell in cells): - rows.append(cells) - - if not rows: - return None - - return (headers, rows) - - -def _extract_metadata(text: str) -> Dict[str, str]: - """ - Extract metadata from document header text. - Looks for title, office, notice number, and description. 
- """ - metadata = { - "title": "", - "office": "", - "notice_no": "", - "description": "" - } - - lines = [line.strip() for line in text.split('\n') if line.strip()] - - # Extract office (usually first non-empty line) - if lines: - metadata["office"] = lines[0] - - # Look for notice number pattern (like "पत्रक सं- 1239" or "सं- 1239") - notice_pattern = r'(?:पत्रक\s+)?सं[-\s:]*(\d+)' - for line in lines[:10]: # Check first 10 lines - match = re.search(notice_pattern, line) - if match: - metadata["notice_no"] = match.group(1) - break - - # Look for title - usually in quotes or contains specific keywords - # Check for quoted text first - quoted_title = re.search(r'["""]([^"""]+)["""]', text[:1000]) - if quoted_title: - metadata["title"] = quoted_title.group(1).strip() - else: - # Look for title patterns - title_keywords = ['सम्पत्ति', 'सूचना', 'विज्ञप्ति', 'नाम परिवर्तन'] - for line in lines[:5]: - if any(keyword in line for keyword in title_keywords): - # Extract the title phrase - title_match = re.search(r'(सम्पत्ति[^।]*|सूचना[^।]*|विज्ञप्ति[^।]*)', line) - if title_match: - metadata["title"] = title_match.group(1).strip() - break - - # Extract description (text before table, usually contains key phrases) - description_keywords = ['नाम परिवर्तन', 'अधिनियम', 'धारा', 'प्रकाशन', 'आवेदन'] - description_parts = [] - for i, line in enumerate(lines[:15]): # Check first 15 lines - if any(keyword in line for keyword in description_keywords): - description_parts.append(line) - # Get a few surrounding lines for context - if i > 0: - description_parts.insert(0, lines[i-1]) - if i < len(lines) - 1: - description_parts.append(lines[i+1]) - break - - if description_parts: - description = ' '.join(description_parts).strip() - if len(description) > 30: # Only if substantial - # Clean up and limit length - description = re.sub(r'\s+', ' ', description) - metadata["description"] = description[:300] # Limit length - - return metadata - - -def _extract_footer_notes(text: str) -> 
List[str]: - """ - Extract footer notes from document. - Usually appears after the table. - """ - notes = [] - - # Find table end - lines = text.split('\n') - table_end_idx = len(lines) - - for i, line in enumerate(lines): - if '|' in line: - # Find last table line - j = i + 1 - while j < len(lines) and ('|' in lines[j] or re.match(r'^[\s\|\-:]+$', lines[j])): - j += 1 - table_end_idx = j - break - - # Extract footer text (after table) - footer_lines = lines[table_end_idx:] - footer_text = '\n'.join(footer_lines).strip() - - # Split into sentences/notes - # Look for sentences ending with period, exclamation, or specific keywords - sentences = re.split(r'[।\.!]\s+', footer_text) - - for sentence in sentences: - sentence = sentence.strip() - if len(sentence) > 20: # Only substantial notes - # Clean up - sentence = re.sub(r'\s+', ' ', sentence) - if sentence: - notes.append(sentence) - - # Limit to most relevant notes (usually 2-4) - return notes[:5] - - -def _parse_text_with_tables(text: str) -> Dict[str, Any]: - """ - Parse text and extract structured data including tables. - Returns structured JSON format with metadata, table, and footer_notes. 
- """ - result = { - "text": text, # Keep original text - "metadata": {}, - "table": [], - "footer_notes": [] - } - - # Check if text contains a table - table_data = _parse_markdown_table(text) - - if table_data: - headers, rows = table_data - print(f"[INFO] Found table with {len(headers)} columns and {len(rows)} rows") - - # Extract metadata - result["metadata"] = _extract_metadata(text) - - # Map headers to field names using original header text - # Keep original language, just make valid JSON keys and handle duplicates - header_mapping = {} - header_counts = {} # Track occurrences of each header - - for i, header in enumerate(headers): - header_clean = header.strip() - - # Create a valid JSON key from the original header - # Remove special characters that aren't valid in JSON keys, but keep the text - # Replace spaces and special chars with underscores, but preserve the original text - header_key = header_clean - - # Track how many times we've seen this exact header - if header_key not in header_counts: - header_counts[header_key] = 0 - - header_counts[header_key] += 1 - - # If this header appears multiple times, append a number - if header_counts[header_key] > 1: - header_key = f"{header_key}_{header_counts[header_key]}" - - # Clean the key to be valid for JSON (remove/replace problematic characters) - # Keep the original text but make it JSON-safe - header_key = re.sub(r'[^\w\s\u0900-\u097F]', '', header_key) # Keep Unicode Hindi chars - header_key = re.sub(r'\s+', '_', header_key) # Replace spaces with underscores - - # If key is empty after cleaning, use column index - if not header_key: - header_key = f"column_{i+1}" - - header_mapping[i] = header_key - - # Parse table rows - each row becomes a separate section - table_rows_dict = {} - for idx, row in enumerate(rows, start=1): - row_dict = {} - for i, header_idx in header_mapping.items(): - if i < len(row): - row_dict[header_idx] = row[i].strip() - - if row_dict: - # Each row is a separate section: row_1, 
row_2, etc. - table_rows_dict[f"row_{idx}"] = row_dict - - # Store rows as separate sections instead of array - result["table"] = table_rows_dict - - # Extract footer notes - result["footer_notes"] = _extract_footer_notes(text) - else: - # No table found, just extract basic metadata - result["metadata"] = _extract_metadata(text) - result["footer_notes"] = _extract_footer_notes(text) - - return result - - -async def _extract_text_with_ocr(image_bytes: bytes, page_num: int, total_pages: int) -> Dict[str, Any]: - """ - Extract text from a single page/image using the OCR model. - Returns text output in full_text field, keeps fields empty for now. - """ - # Convert image bytes to base64 data URL - data_url = _image_bytes_to_base64(image_bytes) - - print(f"[INFO] OCR: Processing page {page_num}/{total_pages} with model {OCR_MODEL_NAME}") - - try: - # Use OpenAI client with OCR endpoint (as per sample code) - import asyncio - loop = asyncio.get_event_loop() - - # Run the synchronous OpenAI call in executor - response = await loop.run_in_executor( - None, - lambda: ocr_client.chat.completions.create( - model=OCR_MODEL_NAME, - messages=[ - { - "role": "user", - "content": [ - {"type": "text", "text": "Extract all text from this image"}, - { - "type": "image_url", - "image_url": { - "url": data_url - } - } - ] - } - ], - ) - ) - - # Extract text from response - extracted_text = response.choices[0].message.content - - if not extracted_text: - extracted_text = "" - - print(f"[INFO] OCR: Extracted {len(extracted_text)} characters from page {page_num}") - - # Calculate confidence based on response quality - confidence = _calculate_ocr_confidence(response, extracted_text) - - # Return text in full_text, keep fields empty for now - return { - "doc_type": "other", - "confidence": confidence, - "full_text": extracted_text, - "fields": {} # Keep fields empty for now - } - - except Exception as e: - error_msg = str(e) - print(f"[ERROR] OCR API error for page {page_num}: {error_msg}") 
- raise RuntimeError(f"OCR API error for page {page_num}: {error_msg}") - - -def _calculate_ocr_confidence(response, extracted_text: str) -> float: - """ - Calculate confidence score based on OCR response quality. - Checks for explicit confidence in response, or calculates based on heuristics. - """ - # Check if response has explicit confidence score - try: - # Check response object for confidence-related fields - if hasattr(response, 'usage'): - # Some models provide usage info that might indicate quality - usage = response.usage - if hasattr(usage, 'completion_tokens') and usage.completion_tokens > 0: - # More tokens might indicate better extraction - pass - - # Check if finish_reason indicates quality - if hasattr(response.choices[0], 'finish_reason'): - finish_reason = response.choices[0].finish_reason - if finish_reason == "stop": - # Normal completion - good sign - base_confidence = 85.0 - elif finish_reason == "length": - # Response was truncated - lower confidence - base_confidence = 70.0 - else: - base_confidence = 75.0 - else: - base_confidence = 85.0 - except Exception: - base_confidence = 85.0 - - # Adjust confidence based on text quality heuristics - text_length = len(extracted_text.strip()) - - if text_length == 0: - return 0.0 - elif text_length < 10: - # Very short text - might be error or empty - return max(30.0, base_confidence - 30.0) - elif text_length < 50: - # Short text - return max(50.0, base_confidence - 15.0) - elif text_length > 1000: - # Long text - likely good extraction - confidence = min(95.0, base_confidence + 10.0) - else: - confidence = base_confidence - - # Check for structured content (tables, etc.) 
- indicates good extraction - if '|' in extracted_text and extracted_text.count('|') > 5: - # Table detected - boost confidence - confidence = min(95.0, confidence + 5.0) - - # Check for meaningful content (non-whitespace ratio) - non_whitespace = len([c for c in extracted_text if not c.isspace()]) - if text_length > 0: - content_ratio = non_whitespace / text_length - if content_ratio > 0.8: - # High content ratio - good - confidence = min(95.0, confidence + 3.0) - elif content_ratio < 0.3: - # Low content ratio - mostly whitespace - confidence = max(50.0, confidence - 10.0) - - return round(confidence, 1) - - -async def extract_fields_from_document( - file_bytes: bytes, - content_type: str, - filename: str, -) -> Dict[str, Any]: - """ - Extract text from document using OCR model. - Processes pages separately for better reliability. - Returns text output in full_text, keeps JSON/XML fields empty for now. - """ - # Get raw image bytes for processing - if content_type == "application/pdf" or content_type.endswith("/pdf"): - if not PDF_SUPPORT: - raise RuntimeError("PDF support requires PyMuPDF. 
Please install it.") - # For PDFs, convert to images - pdf_images = _pdf_to_images(file_bytes) - image_bytes_list = pdf_images - else: - # For regular images, process the file bytes - # Convert to JPEG for consistency - try: - img = Image.open(BytesIO(file_bytes)) - if img.mode != "RGB": - img = img.convert("RGB") - - # Resize if too large (max 1920px on longest side) - max_size = 1920 - w, h = img.size - if w > max_size or h > max_size: - if w > h: - new_w = max_size - new_h = int(h * (max_size / w)) - else: - new_h = max_size - new_w = int(w * (max_size / h)) - img = img.resize((new_w, new_h), Image.LANCZOS) - print(f"[INFO] Resized image from {w}x{h} to {new_w}x{new_h}") - - # Convert to JPEG bytes - img_bytes = BytesIO() - img.save(img_bytes, format="JPEG", quality=95) - image_bytes_list = [img_bytes.getvalue()] - except Exception as e: - # Fallback: use original file bytes - print(f"[WARNING] Could not process image with PIL: {e}. Using original bytes.") - image_bytes_list = [file_bytes] - - total_pages = len(image_bytes_list) - print(f"[INFO] Processing {total_pages} page(s) with OCR model...") - - # Process each page separately - page_results = [] - for page_num, img_bytes in enumerate(image_bytes_list): - print(f"[INFO] Processing page {page_num + 1}/{total_pages}...") - try: - page_result = await _extract_text_with_ocr(img_bytes, page_num + 1, total_pages) - page_results.append({ - "page_number": page_num + 1, - "text": page_result.get("full_text", ""), - "fields": page_result.get("fields", {}), - "confidence": page_result.get("confidence", 0), - "doc_type": page_result.get("doc_type", "other"), - }) - print(f"[INFO] Page {page_num + 1} processed successfully") - except Exception as e: - print(f"[ERROR] Failed to process page {page_num + 1}: {e}") - page_results.append({ - "page_number": page_num + 1, - "text": "", - "fields": {}, - "confidence": 0, - "error": str(e) - }) - - # Combine results from all pages - combined_full_text = "\n\n".join([f"=== PAGE 
{p['page_number']} ===\n\n{p['text']}" for p in page_results if p.get("text")]) - - # Parse each page for tables and structure the output - structured_pages = {} - for page_result in page_results: - if page_result.get("text"): - page_num = page_result.get("page_number", 1) - page_text = page_result.get("text", "") - - # Parse text for tables and structure - parsed_data = _parse_text_with_tables(page_text) - - # Build structured page output - page_key = f"page_{page_num}" - structured_pages[page_key] = { - "text": parsed_data["text"], - "metadata": parsed_data["metadata"], - "table": parsed_data["table"], - "footer_notes": parsed_data["footer_notes"], - "confidence": page_result.get("confidence", 0), - "doc_type": page_result.get("doc_type", "other") - } - - # If we have structured pages, use them; otherwise keep fields empty - if structured_pages: - # Always return pages with page_X keys (even for single page) - combined_fields = structured_pages - else: - combined_fields = {} - - # Calculate average confidence - confidences = [p.get("confidence", 0) for p in page_results if p.get("confidence", 0) > 0] - avg_confidence = sum(confidences) / len(confidences) if confidences else 0 - - # Determine doc_type from first successful page - doc_type = "other" - for page_result in page_results: - if page_result.get("doc_type") and page_result["doc_type"] != "other": - doc_type = page_result["doc_type"] - break - - return { - "doc_type": doc_type, - "confidence": avg_confidence, - "full_text": combined_full_text, - "fields": combined_fields, # Now contains structured data with tables - "pages": page_results - } +import os +import base64 +import json +import re +import time +import asyncio +from io import BytesIO +from typing import Any, Dict, List, Optional, Tuple +import httpx + +try: + import fitz # PyMuPDF + from PIL import Image + PDF_SUPPORT = True +except ImportError as e: + PDF_SUPPORT = False + print(f"[WARNING] PDF support libraries not available: {e}. 
PDF conversion will not work.") + + +# RunPod Serverless OCR Configuration +RUNPOD_ENDPOINT = os.environ.get("RUNPOD_ENDPOINT", "https://api.runpod.ai/v2/j2jvf8t6n0rk5c/run") +RUNPOD_API_KEY = os.environ.get("RUNPOD_API_KEY", "rpa_0UJOK33ZO7SID9B3ASFSKKPUHNPBQC5Z2128RB4O4qi9ts") + +# Extract endpoint ID from endpoint URL for status polling +# URL format: https://api.runpod.ai/v2/{endpoint_id}/run +_endpoint_id = RUNPOD_ENDPOINT.split("/v2/")[1].split("/")[0] if "/v2/" in RUNPOD_ENDPOINT else None +RUNPOD_STATUS_ENDPOINT = f"https://api.runpod.ai/v2/{_endpoint_id}/status" if _endpoint_id else None + + +def _pdf_to_images(pdf_bytes: bytes) -> List[bytes]: + """ + Convert PDF pages to PNG images. + Returns a list of PNG image bytes, one per page. + """ + if not PDF_SUPPORT: + raise RuntimeError("PyMuPDF not installed. Cannot convert PDF to images.") + + pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf") + images = [] + + print(f"[INFO] PDF has {len(pdf_doc)} page(s)") + + for page_num in range(len(pdf_doc)): + page = pdf_doc[page_num] + # Render page to image (zoom factor 2 for better quality) + mat = fitz.Matrix(2.0, 2.0) # 2x zoom for better quality + pix = page.get_pixmap(matrix=mat) + + # Convert to PIL Image + img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) + + # Resize if too large to avoid GPU memory issues (max 1920px on longest side) + max_size = 1920 + w, h = img.size + if w > max_size or h > max_size: + if w > h: + new_w = max_size + new_h = int(h * (max_size / w)) + else: + new_h = max_size + new_w = int(w * (max_size / h)) + img = img.resize((new_w, new_h), Image.LANCZOS) + print(f"[INFO] Resized page {page_num + 1} from {w}x{h} to {new_w}x{new_h}") + else: + print(f"[INFO] Converted page {page_num + 1} to image ({w}x{h})") + + # Convert to JPEG bytes (better compression) + img_bytes = BytesIO() + img.save(img_bytes, format="JPEG", quality=95) + images.append(img_bytes.getvalue()) + + pdf_doc.close() + return images + + +def 
_image_bytes_to_base64(image_bytes: bytes) -> str: + """Convert image bytes to base64 data URL (JPEG format).""" + b64 = base64.b64encode(image_bytes).decode("utf-8") + data_url = f"data:image/jpeg;base64,{b64}" + print(f"[DEBUG] Base64 encoded image: {len(image_bytes)} bytes -> {len(data_url)} chars") + return data_url + + +def _parse_markdown_table(text: str) -> Optional[Tuple[List[str], List[List[str]]]]: + """ + Parse a markdown table from text. + Returns (headers, rows) if table found, None otherwise. + Handles various table formats including malformed ones. + """ + lines = [line.strip() for line in text.split('\n')] + + # Find potential table start (line with multiple | and actual text content) + table_start = None + for i, line in enumerate(lines): + if '|' in line and line.count('|') >= 2: + # Skip separator lines (only |, -, :, spaces) + if re.match(r'^[\s\|\-:]+$', line): + continue + # Check if line has meaningful text (not just | characters) + cells = [cell.strip() for cell in line.split('|')] + if cells and not cells[0]: + cells = cells[1:] + if cells and not cells[-1]: + cells = cells[:-1] + # Must have at least 2 columns with some text + meaningful_cells = [c for c in cells if len(c) > 0] + if len(meaningful_cells) >= 2: + table_start = i + break + + if table_start is None: + return None + + # Find table end (first non-empty line without | after table start) + table_end = None + for i in range(table_start + 1, len(lines)): + line = lines[i] + if not line: # Empty line, continue + continue + if '|' not in line: + # Non-empty line without | means table ended + table_end = i + break + + if table_end is None: + table_end = len(lines) + + table_lines = lines[table_start:table_end] + + # Find the actual header row (should have meaningful text, not just | or separators) + headers = None + header_idx = None + + for i, line in enumerate(table_lines): + if not line or '|' not in line: + continue + + # Skip separator lines (lines with only |, -, :, spaces) + if 
re.match(r'^[\s\|\-:]+$', line): + continue + + # Check if this line has meaningful content (not just | characters) + cells = [cell.strip() for cell in line.split('|')] + # Remove empty cells at start/end + if cells and not cells[0]: + cells = cells[1:] + if cells and not cells[-1]: + cells = cells[:-1] + + # Header should have at least 3 columns and meaningful text + if len(cells) >= 3: + # Check if cells have actual text (not just empty or single char) + meaningful_cells = [c for c in cells if len(c) > 1] + if len(meaningful_cells) >= 3: + headers = cells + header_idx = i + break + + if not headers or header_idx is None: + return None + + # Parse data rows (skip separator line after header if present) + rows = [] + num_columns = len(headers) + + for i in range(header_idx + 1, len(table_lines)): + line = table_lines[i] + + if not line: + continue + + # Skip separator lines + if re.match(r'^[\s\|\-:]+$', line): + continue + + if '|' not in line: + # No more table rows + break + + cells = [cell.strip() for cell in line.split('|')] + # Remove empty cells at start/end + if cells and not cells[0]: + cells = cells[1:] + if cells and not cells[-1]: + cells = cells[:-1] + + # Only add rows that match header column count (allow some flexibility) + if len(cells) == num_columns or (len(cells) >= num_columns - 1 and len(cells) <= num_columns + 1): + # Pad or trim to match header count + if len(cells) < num_columns: + cells.extend([''] * (num_columns - len(cells))) + elif len(cells) > num_columns: + cells = cells[:num_columns] + + # Only add if row has at least one non-empty cell + if any(cell for cell in cells): + rows.append(cells) + + if not rows: + return None + + return (headers, rows) + + +def _extract_metadata(text: str) -> Dict[str, str]: + """ + Extract metadata from document header text. + Looks for title, office, notice number, and description. 
+ """ + metadata = { + "title": "", + "office": "", + "notice_no": "", + "description": "" + } + + lines = [line.strip() for line in text.split('\n') if line.strip()] + + # Extract office (usually first non-empty line) + if lines: + metadata["office"] = lines[0] + + # Look for notice number pattern (like "पत्रक सं- 1239" or "सं- 1239") + notice_pattern = r'(?:पत्रक\s+)?सं[-\s:]*(\d+)' + for line in lines[:10]: # Check first 10 lines + match = re.search(notice_pattern, line) + if match: + metadata["notice_no"] = match.group(1) + break + + # Look for title - usually in quotes or contains specific keywords + # Check for quoted text first + quoted_title = re.search(r'["""]([^"""]+)["""]', text[:1000]) + if quoted_title: + metadata["title"] = quoted_title.group(1).strip() + else: + # Look for title patterns + title_keywords = ['सम्पत्ति', 'सूचना', 'विज्ञप्ति', 'नाम परिवर्तन'] + for line in lines[:5]: + if any(keyword in line for keyword in title_keywords): + # Extract the title phrase + title_match = re.search(r'(सम्पत्ति[^।]*|सूचना[^।]*|विज्ञप्ति[^।]*)', line) + if title_match: + metadata["title"] = title_match.group(1).strip() + break + + # Extract description (text before table, usually contains key phrases) + description_keywords = ['नाम परिवर्तन', 'अधिनियम', 'धारा', 'प्रकाशन', 'आवेदन'] + description_parts = [] + for i, line in enumerate(lines[:15]): # Check first 15 lines + if any(keyword in line for keyword in description_keywords): + description_parts.append(line) + # Get a few surrounding lines for context + if i > 0: + description_parts.insert(0, lines[i-1]) + if i < len(lines) - 1: + description_parts.append(lines[i+1]) + break + + if description_parts: + description = ' '.join(description_parts).strip() + if len(description) > 30: # Only if substantial + # Clean up and limit length + description = re.sub(r'\s+', ' ', description) + metadata["description"] = description[:300] # Limit length + + return metadata + + +def _parse_model_response(response_text: str) 
-> Tuple[str, Dict[str, Any]]: + """ + Parse model response to extract text and metadata. + The model may return text and metadata in various formats. + Returns: (extracted_text, metadata_dict) + """ + metadata = {} + text = response_text + + # Try to find JSON metadata section + # Look for METADATA: or metadata: section + metadata_patterns = [ + r'METADATA:\s*\n?\s*({.*?})(?:\n\n|\nTEXT|$)', + r'metadata:\s*\n?\s*({.*?})(?:\n\n|\nTEXT|$)', + r'METADATA:\s*\n?\s*```json\s*({.*?})\s*```', + r'METADATA:\s*\n?\s*```\s*({.*?})\s*```', + ] + + for pattern in metadata_patterns: + match = re.search(pattern, response_text, re.DOTALL | re.IGNORECASE) + if match: + try: + metadata_json = match.group(1).strip() + metadata = json.loads(metadata_json) + # Remove metadata section from text + text = response_text[:match.start()] + response_text[match.end():] + break + except (json.JSONDecodeError, IndexError): + continue + + # If no JSON found, try to extract metadata from structured text format + if not metadata: + # Look for key-value pairs in METADATA section + metadata_section = re.search(r'METADATA:\s*\n(.*?)(?:\n\n|\nTEXT|$)', response_text, re.DOTALL | re.IGNORECASE) + if metadata_section: + metadata_text = metadata_section.group(1) + # Parse key-value pairs + for line in metadata_text.split('\n'): + if ':' in line: + parts = line.split(':', 1) + if len(parts) == 2: + key = parts[0].strip().lower().replace(' ', '_') + value = parts[1].strip() + if value: + metadata[key] = value + + # Extract TEXT section if present + text_match = re.search(r'TEXT:\s*\n(.*?)(?:\n\nMETADATA|$)', response_text, re.DOTALL | re.IGNORECASE) + if text_match: + text = text_match.group(1).strip() + else: + # If no TEXT section, remove METADATA section if found + text = re.sub(r'METADATA:.*', '', response_text, flags=re.DOTALL | re.IGNORECASE).strip() + + # Clean up text + text = text.strip() + + # Clean up metadata - remove empty values + metadata = {k: v for k, v in metadata.items() if v and 
str(v).strip()} + + return text, metadata + + +def _extract_footer_notes(text: str) -> List[str]: + """ + Extract footer notes from document. + Usually appears after the table. + """ + notes = [] + + # Find table end + lines = text.split('\n') + table_end_idx = len(lines) + + for i, line in enumerate(lines): + if '|' in line: + # Find last table line + j = i + 1 + while j < len(lines) and ('|' in lines[j] or re.match(r'^[\s\|\-:]+$', lines[j])): + j += 1 + table_end_idx = j + break + + # Extract footer text (after table) + footer_lines = lines[table_end_idx:] + footer_text = '\n'.join(footer_lines).strip() + + # Split into sentences/notes + # Look for sentences ending with period, exclamation, or specific keywords + sentences = re.split(r'[।\.!]\s+', footer_text) + + for sentence in sentences: + sentence = sentence.strip() + if len(sentence) > 20: # Only substantial notes + # Clean up + sentence = re.sub(r'\s+', ' ', sentence) + if sentence: + notes.append(sentence) + + # Limit to most relevant notes (usually 2-4) + return notes[:5] + + +def _parse_text_with_tables(text: str, page_metadata: Dict[str, Any] = None) -> Dict[str, Any]: + """ + Parse text and extract structured data including tables. + Uses model-extracted metadata if provided, otherwise falls back to basic extraction. + Returns structured JSON format with metadata, table, and footer_notes. 
+ """ + result = { + "text": text, # Keep original text + "metadata": page_metadata if page_metadata else {}, + "table": [], + "footer_notes": [] + } + + # Check if text contains a table + table_data = _parse_markdown_table(text) + + if table_data: + headers, rows = table_data + print(f"[INFO] Found table with {len(headers)} columns and {len(rows)} rows") + + # Use provided metadata or extract basic metadata as fallback + if not result["metadata"]: + result["metadata"] = _extract_metadata(text) + + # Map headers to field names using original header text + # Keep original language, just make valid JSON keys and handle duplicates + header_mapping = {} + header_counts = {} # Track occurrences of each header + + for i, header in enumerate(headers): + header_clean = header.strip() + + # Create a valid JSON key from the original header + # Remove special characters that aren't valid in JSON keys, but keep the text + # Replace spaces and special chars with underscores, but preserve the original text + header_key = header_clean + + # Track how many times we've seen this exact header + if header_key not in header_counts: + header_counts[header_key] = 0 + + header_counts[header_key] += 1 + + # If this header appears multiple times, append a number + if header_counts[header_key] > 1: + header_key = f"{header_key}_{header_counts[header_key]}" + + # Clean the key to be valid for JSON (remove/replace problematic characters) + # Keep the original text but make it JSON-safe + header_key = re.sub(r'[^\w\s\u0900-\u097F]', '', header_key) # Keep Unicode Hindi chars + header_key = re.sub(r'\s+', '_', header_key) # Replace spaces with underscores + + # If key is empty after cleaning, use column index + if not header_key: + header_key = f"column_{i+1}" + + header_mapping[i] = header_key + + # Parse table rows - each row becomes a separate section + table_rows_dict = {} + for idx, row in enumerate(rows, start=1): + row_dict = {} + for i, header_idx in header_mapping.items(): + if i < 
len(row): + row_dict[header_idx] = row[i].strip() + + if row_dict: + # Each row is a separate section: row_1, row_2, etc. + table_rows_dict[f"row_{idx}"] = row_dict + + # Store rows as separate sections instead of array + result["table"] = table_rows_dict + + # Extract footer notes + result["footer_notes"] = _extract_footer_notes(text) + else: + # No table found, just extract basic metadata + result["metadata"] = _extract_metadata(text) + result["footer_notes"] = _extract_footer_notes(text) + + return result + + +async def _poll_runpod_job(job_id: str, client: httpx.AsyncClient, max_wait_time: int = 300) -> Dict[str, Any]: + """ + Poll RunPod job status until completion. + Returns the final job result with output. + """ + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {RUNPOD_API_KEY}" + } + + start_time = time.time() + poll_interval = 2 # Poll every 2 seconds + + while True: + # Check timeout + elapsed = time.time() - start_time + if elapsed > max_wait_time: + raise RuntimeError(f"Job {job_id} timed out after {max_wait_time} seconds") + + # Poll job status + status_url = f"{RUNPOD_STATUS_ENDPOINT}/{job_id}" + response = await client.get(status_url, headers=headers) + response.raise_for_status() + status_result = response.json() + + status = status_result.get("status", "").upper() + + if status == "COMPLETED": + print(f"[INFO] Job {job_id} completed successfully") + return status_result + elif status == "FAILED": + error_msg = status_result.get("error", "Unknown error") + raise RuntimeError(f"Job {job_id} failed: {error_msg}") + elif status in ["IN_QUEUE", "IN_PROGRESS"]: + print(f"[INFO] Job {job_id} status: {status}, waiting...") + await asyncio.sleep(poll_interval) + else: + # Unknown status, wait and retry + print(f"[INFO] Job {job_id} status: {status}, waiting...") + await asyncio.sleep(poll_interval) + + +async def _extract_text_with_ocr(image_bytes: bytes, page_num: int, total_pages: int, custom_prompt: str = None) -> Dict[str, 
Any]: + """ + Extract text and metadata from a single page/image using the RunPod serverless OCR model. + Uses model-driven extraction to identify and extract metadata fields dynamically. + Returns text output in full_text field and extracted metadata. + + Args: + image_bytes: Image bytes to process + page_num: Page number + total_pages: Total number of pages + custom_prompt: Optional custom prompt for field extraction + """ + # Convert image bytes to base64 + image_base64 = base64.b64encode(image_bytes).decode("utf-8") + + print(f"[INFO] OCR: Processing page {page_num}/{total_pages} with RunPod endpoint") + + try: + # Use custom prompt if provided, otherwise use default + if custom_prompt: + metadata_prompt = custom_prompt + else: + # Default prompt for general text extraction + metadata_prompt = """Extract all text from this image.""" + + # Prepare request payload for RunPod + # RunPod serverless endpoints expect image_base64, image_url, or image_path + payload = { + "input": { + "prompt": metadata_prompt, + "image_base64": image_base64 # Base64 encoded image + } + } + + # Make HTTP request to RunPod endpoint + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {RUNPOD_API_KEY}" + } + + async with httpx.AsyncClient(timeout=300.0) as client: + # Submit job + response = await client.post( + RUNPOD_ENDPOINT, + headers=headers, + json=payload + ) + response.raise_for_status() + result = response.json() + + # Check if this is an async job (has job ID and status) + job_id = result.get("id") + status = result.get("status", "").upper() + + if job_id and status in ["IN_QUEUE", "IN_PROGRESS"]: + # This is an async job, need to poll for completion + print(f"[INFO] Job submitted with ID: {job_id}, status: {status}") + if not RUNPOD_STATUS_ENDPOINT: + raise RuntimeError("RunPod status endpoint not configured. 
Cannot poll async job.") + + # Poll until completion + result = await _poll_runpod_job(job_id, client) + + # Extract text from RunPod response + # RunPod serverless typically returns: {"id": "...", "status": "...", "output": "..."} + # The output might be a string or a dict depending on the model + extracted_text = "" + + if "output" in result: + output = result["output"] + if isinstance(output, str): + extracted_text = output + elif isinstance(output, dict): + # If output is a dict, try common fields + extracted_text = output.get("text", output.get("result", output.get("content", ""))) + if not extracted_text and isinstance(output.get("text"), str): + extracted_text = output["text"] + elif isinstance(output, list) and len(output) > 0: + # If output is a list, take the first element + extracted_text = str(output[0]) + elif "result" in result: + extracted_text = str(result["result"]) + elif "text" in result: + extracted_text = str(result["text"]) + else: + # Fallback: convert entire response to string + extracted_text = str(result) + + if not extracted_text: + extracted_text = "" + + print(f"[INFO] OCR: Extracted {len(extracted_text)} characters from page {page_num}") + + # Parse model response to extract text and metadata + parsed_text, parsed_metadata = _parse_model_response(extracted_text) + + # Calculate confidence based on response quality + # Create a mock response object for compatibility with confidence calculation + mock_response = type('obj', (object,), { + 'choices': [type('obj', (object,), {'finish_reason': 'stop'})()], + 'usage': type('obj', (object,), {'completion_tokens': len(parsed_text.split())})() + })() + confidence = _calculate_ocr_confidence(mock_response, parsed_text) + + # Determine document type from metadata if available + doc_type = parsed_metadata.get("document_type", "other") + if doc_type == "other" and parsed_metadata.get("title"): + # Try to infer from title + title_lower = parsed_metadata.get("title", "").lower() + if any(kw in 
title_lower for kw in ["tender", "bid", "quotation"]): + doc_type = "tender" + elif any(kw in title_lower for kw in ["recruitment", "appointment", "vacancy"]): + doc_type = "recruitment" + elif any(kw in title_lower for kw in ["notice", "notification", "circular"]): + doc_type = "notice" + + # Return text and extracted metadata + return { + "doc_type": doc_type, + "confidence": confidence, + "full_text": parsed_text, + "fields": parsed_metadata if parsed_metadata else {} # Model-extracted metadata + } + + except httpx.HTTPStatusError as e: + error_msg = f"HTTP {e.response.status_code}: {e.response.text}" + print(f"[ERROR] OCR API HTTP error for page {page_num}: {error_msg}") + raise RuntimeError(f"OCR API error for page {page_num}: {error_msg}") + except Exception as e: + error_msg = str(e) + print(f"[ERROR] OCR API error for page {page_num}: {error_msg}") + raise RuntimeError(f"OCR API error for page {page_num}: {error_msg}") + + +def _calculate_ocr_confidence(response, extracted_text: str) -> float: + """ + Calculate confidence score based on OCR response quality. + Returns a score from 0-100, with higher scores for better extraction quality. + """ + # Start with a higher base confidence for successful extractions + base_confidence = 92.0 + + # Adjust confidence based on text quality heuristics + text_length = len(extracted_text.strip()) + + if text_length == 0: + return 0.0 + elif text_length < 10: + # Very short text - might be error or empty + return max(30.0, base_confidence - 40.0) + elif text_length < 50: + # Short text - might be incomplete + return max(60.0, base_confidence - 20.0) + elif text_length > 1000: + # Long text - likely good extraction + confidence = min(100.0, base_confidence + 5.0) + elif text_length > 500: + # Medium-long text - good extraction + confidence = min(100.0, base_confidence + 3.0) + else: + confidence = base_confidence + + # Check for structured content (tables, etc.) 
- indicates good extraction + if '|' in extracted_text and extracted_text.count('|') > 5: + # Table detected - boost confidence significantly + confidence = min(100.0, confidence + 6.0) + + # Check for meaningful content (non-whitespace ratio) + non_whitespace = len([c for c in extracted_text if not c.isspace()]) + if text_length > 0: + content_ratio = non_whitespace / text_length + if content_ratio > 0.85: + # Very high content ratio - excellent extraction + confidence = min(100.0, confidence + 5.0) + elif content_ratio > 0.75: + # High content ratio - good extraction + confidence = min(100.0, confidence + 3.0) + elif content_ratio > 0.6: + # Moderate content ratio - decent extraction + confidence = min(100.0, confidence + 1.0) + elif content_ratio < 0.3: + # Low content ratio - mostly whitespace + confidence = max(60.0, confidence - 15.0) + + # Check for common OCR quality indicators + # Presence of numbers, dates, and structured patterns indicates good extraction + has_numbers = any(c.isdigit() for c in extracted_text) + has_letters = any(c.isalpha() for c in extracted_text) + has_punctuation = any(c in '.,;:!?()[]{}' for c in extracted_text) + + if has_numbers and has_letters and has_punctuation: + # Well-structured text with mixed content - high confidence + confidence = min(100.0, confidence + 2.0) + + # Cap at 100% and ensure minimum quality threshold + return round(min(100.0, max(0.0, confidence)), 1) + + +async def extract_fields_from_document( + file_bytes: bytes, + content_type: str, + filename: str, + key_fields: str = None, +) -> Dict[str, Any]: + """ + Extract text from document using OCR model. + Processes pages separately for better reliability. + Returns text output in full_text, keeps JSON/XML fields empty for now. + """ + # Get raw image bytes for processing + if content_type == "application/pdf" or content_type.endswith("/pdf"): + if not PDF_SUPPORT: + raise RuntimeError("PDF support requires PyMuPDF. 
Please install it.") + # For PDFs, convert to images + pdf_images = _pdf_to_images(file_bytes) + image_bytes_list = pdf_images + else: + # For regular images, process the file bytes + # Convert to JPEG for consistency + try: + img = Image.open(BytesIO(file_bytes)) + if img.mode != "RGB": + img = img.convert("RGB") + + # Resize if too large (max 1920px on longest side) + max_size = 1920 + w, h = img.size + if w > max_size or h > max_size: + if w > h: + new_w = max_size + new_h = int(h * (max_size / w)) + else: + new_h = max_size + new_w = int(w * (max_size / h)) + img = img.resize((new_w, new_h), Image.LANCZOS) + print(f"[INFO] Resized image from {w}x{h} to {new_w}x{new_h}") + + # Convert to JPEG bytes + img_bytes = BytesIO() + img.save(img_bytes, format="JPEG", quality=95) + image_bytes_list = [img_bytes.getvalue()] + except Exception as e: + # Fallback: use original file bytes + print(f"[WARNING] Could not process image with PIL: {e}. Using original bytes.") + image_bytes_list = [file_bytes] + + total_pages = len(image_bytes_list) + print(f"[INFO] Processing {total_pages} page(s) with OCR model...") + + # Process each page separately + page_results = [] + for page_num, img_bytes in enumerate(image_bytes_list): + print(f"[INFO] Processing page {page_num + 1}/{total_pages}...") + try: + page_result = await _extract_text_with_ocr(img_bytes, page_num + 1, total_pages, None) + page_results.append({ + "page_number": page_num + 1, + "text": page_result.get("full_text", ""), + "fields": page_result.get("fields", {}), + "confidence": page_result.get("confidence", 0), + "doc_type": page_result.get("doc_type", "other"), + }) + print(f"[INFO] Page {page_num + 1} processed successfully") + except Exception as e: + print(f"[ERROR] Failed to process page {page_num + 1}: {e}") + page_results.append({ + "page_number": page_num + 1, + "text": "", + "fields": {}, + "confidence": 0, + "error": str(e) + }) + + # Combine results from all pages + combined_full_text = "\n\n".join([f"=== 
PAGE {p['page_number']} ===\n\n{p['text']}" for p in page_results if p.get("text")]) + + # Extract user-specified fields if key_fields provided + extracted_fields = {} + if key_fields and key_fields.strip(): + # Parse user input: "Invoice Number, Invoice Date, PO Number" -> ['Invoice Number', 'Invoice Date', 'PO Number'] + field_list = [f.strip() for f in key_fields.split(',') if f.strip()] + if field_list: + print(f"[INFO] Extracting user-specified fields: {field_list}") + + # Format fields as JSON array string for prompt + fields_json = json.dumps(field_list) + custom_prompt = f"Extract the following fields from this image and return as JSON: {fields_json}. Return only a valid JSON object with the field names as keys and their extracted values." + + # Run second OCR pass on first page (usually has most metadata) with custom prompt + if image_bytes_list and len(image_bytes_list) > 0: + try: + print("[INFO] Running second OCR pass for field extraction...") + field_result = await _extract_text_with_ocr(image_bytes_list[0], 1, 1, custom_prompt) + field_text = field_result.get("full_text", "") + + # Try to parse JSON from the response + try: + # Look for JSON in the response + json_match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', field_text, re.DOTALL) + if json_match: + extracted_fields = json.loads(json_match.group(0)) + print(f"[INFO] Successfully extracted {len(extracted_fields)} fields from second OCR pass") + else: + # Try parsing the entire response as JSON + extracted_fields = json.loads(field_text) + print(f"[INFO] Successfully extracted {len(extracted_fields)} fields from second OCR pass") + except json.JSONDecodeError: + print(f"[WARNING] Could not parse JSON from field extraction response: {field_text[:200]}") + extracted_fields = {} + except Exception as e: + print(f"[WARNING] Field extraction failed: {e}") + extracted_fields = {} + + # Parse each page for tables and structure the output + structured_pages = {} + for page_result in page_results: + if 
page_result.get("text"): + page_num = page_result.get("page_number", 1) + page_text = page_result.get("text", "") + + # Parse text for tables and structure + parsed_data = _parse_text_with_tables(page_text, {}) + + # Build structured page output (without Fields - moved to root level) + page_key = f"page_{page_num}" + structured_pages[page_key] = { + "text": parsed_data["text"], + "table": parsed_data["table"], + "footer_notes": parsed_data["footer_notes"], + "confidence": page_result.get("confidence", 0), + "doc_type": page_result.get("doc_type", "other") + } + + # If we have structured pages, use them; otherwise keep fields empty + if structured_pages: + # Always return pages with page_X keys (even for single page) + combined_fields = structured_pages + else: + combined_fields = {} + + # Calculate average confidence + confidences = [p.get("confidence", 0) for p in page_results if p.get("confidence", 0) > 0] + avg_confidence = sum(confidences) / len(confidences) if confidences else 0 + + # Determine doc_type from first successful page + doc_type = "other" + for page_result in page_results: + if page_result.get("doc_type") and page_result["doc_type"] != "other": + doc_type = page_result["doc_type"] + break + + # Build return object - add Fields at root level only if extracted_fields is not empty + return_obj = { + "doc_type": doc_type, + "confidence": avg_confidence, + "full_text": combined_full_text, + "fields": combined_fields, # Now contains structured data with tables + "pages": page_results + } + + # Add Fields at root level only if user provided key_fields and extraction succeeded + if extracted_fields: + return_obj["Fields"] = extracted_fields + + return return_obj diff --git a/backend/app/otp_service.py b/backend/app/otp_service.py new file mode 100644 index 0000000000000000000000000000000000000000..1d6f86d5f2686c98b5a8229f97761deb1638b449 --- /dev/null +++ b/backend/app/otp_service.py @@ -0,0 +1,197 @@ +""" +OTP (One-Time Password) service for email-based 
authentication. +""" +import random +import string +from datetime import datetime, timedelta +from typing import Dict, Optional +from sqlalchemy.orm import Session +from fastapi import HTTPException +from .models import User +from .brevo_service import send_otp_email + +# Store OTPs in memory (in production, use Redis or database) +otp_store: Dict[str, dict] = {} + + +def generate_otp(length: int = 6) -> str: + """ + Generate a random OTP code. + + Args: + length: Length of OTP (default: 6) + + Returns: + Random OTP string + """ + return ''.join(random.choices(string.digits, k=length)) + + +async def request_otp(email: str, db: Session) -> dict: + """ + Generate and send OTP to email using Brevo. + + Args: + email: Email address to send OTP to + db: Database session + + Returns: + Dictionary with success message + """ + # Generate OTP + otp = generate_otp() + expires_at = datetime.utcnow() + timedelta(minutes=10) + + # Store OTP (in production, use Redis or database with TTL) + otp_store[email.lower()] = { + 'otp': otp, + 'expires_at': expires_at, + 'attempts': 0, + 'max_attempts': 5 + } + + # Send OTP via Brevo + try: + await send_otp_email(email, otp) + print(f"[INFO] OTP generated and sent to {email}") + except Exception as e: + # Remove OTP from store if email sending failed + if email.lower() in otp_store: + del otp_store[email.lower()] + raise HTTPException( + status_code=500, + detail=f"Failed to send OTP email: {str(e)}" + ) + + return { + "message": "OTP sent to your email address", + "expires_in_minutes": 10 + } + + +async def verify_otp(email: str, otp: str, db: Session) -> User: + """ + Verify OTP and return/create user. 
+ + Args: + email: Email address + otp: OTP code to verify + db: Database session + + Returns: + User object + + Raises: + HTTPException: If OTP is invalid, expired, or max attempts exceeded + """ + email_lower = email.lower() + stored = otp_store.get(email_lower) + + if not stored: + raise HTTPException( + status_code=400, + detail="OTP not found. Please request a new OTP." + ) + + # Check if expired + if datetime.utcnow() > stored['expires_at']: + del otp_store[email_lower] + raise HTTPException( + status_code=400, + detail="OTP has expired. Please request a new OTP." + ) + + # Check max attempts + if stored['attempts'] >= stored['max_attempts']: + del otp_store[email_lower] + raise HTTPException( + status_code=400, + detail="Maximum verification attempts exceeded. Please request a new OTP." + ) + + # Verify OTP + if stored['otp'] != otp: + stored['attempts'] += 1 + remaining_attempts = stored['max_attempts'] - stored['attempts'] + raise HTTPException( + status_code=400, + detail=f"Invalid OTP. {remaining_attempts} attempt(s) remaining." 
+ ) + + # OTP verified successfully + # Get or create user + user = db.query(User).filter(User.email == email_lower).first() + + if not user: + user = User( + email=email_lower, + auth_method='otp', + email_verified=True + ) + db.add(user) + db.commit() + db.refresh(user) + print(f"[INFO] New user created via OTP: {email_lower}") + + # Enrich contact data from Apollo.io and update Brevo + Monday.com + try: + from .apollo_service import enrich_contact_by_email + from .brevo_service import create_brevo_contact, BREVO_TRIAL_LIST_ID + from .monday_service import create_monday_lead + + # Enrich contact data from Apollo.io + enriched_data = await enrich_contact_by_email(email_lower) + + # Use enriched data if available + first_name = enriched_data.get("first_name") if enriched_data else None + last_name = enriched_data.get("last_name") if enriched_data else None + org_name = enriched_data.get("organization_name") if enriched_data else None + + # Fallback to email domain if Apollo didn't provide organization + if not org_name: + org_domain = email_lower.split('@')[1] if '@' in email_lower else None + org_name = org_domain.split('.')[0].capitalize() if org_domain else None + + # Update Brevo contact with enriched data + await create_brevo_contact( + email=email_lower, + first_name=first_name, + last_name=last_name, + organization_name=org_name or (enriched_data.get("organization_name") if enriched_data else None), + phone_number=enriched_data.get("phone_number") if enriched_data else None, + linkedin_url=enriched_data.get("linkedin_url") if enriched_data else None, + title=enriched_data.get("title") if enriched_data else None, + headline=enriched_data.get("headline") if enriched_data else None, + organization_website=enriched_data.get("organization_website") if enriched_data else None, + organization_address=enriched_data.get("organization_address") if enriched_data else None, + list_id=BREVO_TRIAL_LIST_ID + ) + + # Create lead in Monday.com + await create_monday_lead( + 
email=email_lower, + first_name=first_name, + last_name=last_name, + phone_number=enriched_data.get("phone_number") if enriched_data else None, + linkedin_url=enriched_data.get("linkedin_url") if enriched_data else None, + title=enriched_data.get("title") if enriched_data else None, + headline=enriched_data.get("headline") if enriched_data else None, + organization_name=org_name or (enriched_data.get("organization_name") if enriched_data else None), + organization_website=enriched_data.get("organization_website") if enriched_data else None, + organization_address=enriched_data.get("organization_address") if enriched_data else None, + ) + except Exception as e: + # Don't fail user creation if integrations fail + print(f"[WARNING] Failed to enrich/update contact for {email_lower}: {str(e)}") + else: + user.email_verified = True + if user.auth_method != 'otp': + user.auth_method = 'otp' + db.commit() + print(f"[INFO] User verified via OTP: {email_lower}") + + # Remove OTP from store after successful verification + del otp_store[email_lower] + + return user + diff --git a/backend/app/schemas.py b/backend/app/schemas.py index 03a05145dd1ad3871f1336c86ecc552dd80bb1c6..497b12693d75ab7f71e39087ca3a34b9efb08136 100644 --- a/backend/app/schemas.py +++ b/backend/app/schemas.py @@ -1,26 +1,26 @@ -from pydantic import BaseModel -from typing import Dict, Optional -from datetime import datetime - - -class ExtractionStage(BaseModel): - time: int - status: str - variation: str - - -class ExtractionRecordBase(BaseModel): - id: int - fileName: str - fileType: str - fileSize: str - extractedAt: datetime - status: str - confidence: float - fieldsExtracted: int - totalTime: int - stages: Dict[str, ExtractionStage] - errorMessage: Optional[str] = None - - class Config: - orm_mode = True +from pydantic import BaseModel +from typing import Dict, Optional +from datetime import datetime + + +class ExtractionStage(BaseModel): + time: int + status: str + variation: str + + +class 
ExtractionRecordBase(BaseModel): + id: int + fileName: str + fileType: str + fileSize: str + extractedAt: datetime + status: str + confidence: float + fieldsExtracted: int + totalTime: int + stages: Dict[str, ExtractionStage] + errorMessage: Optional[str] = None + + class Config: + from_attributes = True diff --git a/backend/requirements.txt b/backend/requirements.txt index 33ed32b47912b6c3367d57b4460b72c7ae3ecaa5..8919890528a5229d1c219e17630dffa72bb3b834 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,11 +1,15 @@ -fastapi -uvicorn[standard] -python-multipart -pydantic -sqlalchemy -httpx -python-dotenv -pymupdf -pillow -huggingface-hub -openai \ No newline at end of file +fastapi +uvicorn[standard] +python-multipart +pydantic[email] +sqlalchemy +httpx +python-dotenv +pymupdf +pillow +huggingface-hub +openai +firebase-admin +pyjwt +python-jose[cryptography] +email-validator \ No newline at end of file diff --git a/frontend/build-env.sh b/frontend/build-env.sh new file mode 100644 index 0000000000000000000000000000000000000000..2331e08e28a647304e294781a2b07a13bb67aee3 --- /dev/null +++ b/frontend/build-env.sh @@ -0,0 +1,22 @@ +#!/bin/sh +# Script to create .env file from environment variables for Vite build +# This is used in Docker build when environment variables are available + +# Debug: Check if variables are set (without exposing values) +echo "Checking environment variables..." 
+[ -z "$VITE_FIREBASE_API_KEY" ] && echo "WARNING: VITE_FIREBASE_API_KEY is not set" || echo "✓ VITE_FIREBASE_API_KEY is set" +[ -z "$VITE_FIREBASE_AUTH_DOMAIN" ] && echo "WARNING: VITE_FIREBASE_AUTH_DOMAIN is not set" || echo "✓ VITE_FIREBASE_AUTH_DOMAIN is set" +[ -z "$VITE_FIREBASE_PROJECT_ID" ] && echo "WARNING: VITE_FIREBASE_PROJECT_ID is not set" || echo "✓ VITE_FIREBASE_PROJECT_ID is set" + +cat > .env << EOF +VITE_FIREBASE_API_KEY=${VITE_FIREBASE_API_KEY:-} +VITE_FIREBASE_AUTH_DOMAIN=${VITE_FIREBASE_AUTH_DOMAIN:-} +VITE_FIREBASE_PROJECT_ID=${VITE_FIREBASE_PROJECT_ID:-} +VITE_FIREBASE_STORAGE_BUCKET=${VITE_FIREBASE_STORAGE_BUCKET:-} +VITE_FIREBASE_MESSAGING_SENDER_ID=${VITE_FIREBASE_MESSAGING_SENDER_ID:-} +VITE_FIREBASE_APP_ID=${VITE_FIREBASE_APP_ID:-} +VITE_API_BASE_URL=${VITE_API_BASE_URL:-} +EOF + +echo "Created .env file with environment variables" + diff --git a/frontend/index.html b/frontend/index.html index 34f407823c568c3e8061eb3754302a5a5bd9409a..e38c36f076b1b892bbc195f552ad2f3bb5051df0 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -1,12 +1,13 @@ - - - - - Document Capture Demo - - - -
- - - + + + + + + EZOFIS AI - VRP Document Intelligence + + + +
+ + + diff --git a/frontend/package.json b/frontend/package.json index c37ec074177bab8941fadc9680bf7f0e5fb0ea19..0b03c89dae50e3f0bd0c89dde8f6f000f930933c 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,25 +1,26 @@ -{ - "name": "document-capture-demo", - "version": "1.0.0", - "private": true, - "scripts": { - "dev": "vite", - "build": "vite build", - "preview": "vite preview" - }, - "dependencies": { - "react": "^18.3.1", - "react-dom": "^18.3.1", - "react-router-dom": "^6.26.2", - "framer-motion": "^11.0.0", - "lucide-react": "^0.471.0", - "pdfjs-dist": "^4.0.379" - }, - "devDependencies": { - "@vitejs/plugin-react": "^4.1.0", - "autoprefixer": "^10.4.20", - "postcss": "^8.4.47", - "tailwindcss": "^3.4.14", - "vite": "^5.4.0" - } -} +{ + "name": "document-capture-demo", + "version": "1.0.0", + "private": true, + "scripts": { + "dev": "vite", + "build": "vite build", + "preview": "vite preview" + }, + "dependencies": { + "react": "^18.3.1", + "react-dom": "^18.3.1", + "react-router-dom": "^6.26.2", + "framer-motion": "^11.0.0", + "lucide-react": "^0.471.0", + "pdfjs-dist": "^4.0.379", + "firebase": "^10.7.1" + }, + "devDependencies": { + "@vitejs/plugin-react": "^4.1.0", + "autoprefixer": "^10.4.20", + "postcss": "^8.4.47", + "tailwindcss": "^3.4.14", + "vite": "^5.4.0" + } +} diff --git a/frontend/postcss.config.cjs b/frontend/postcss.config.cjs index 5cbc2c7d8770dd519eeb059f155ee14aa9dc811a..668189e452b05ff430fafa2853ca63fab9d4fbe1 100644 --- a/frontend/postcss.config.cjs +++ b/frontend/postcss.config.cjs @@ -1,6 +1,6 @@ -module.exports = { - plugins: { - tailwindcss: {}, - autoprefixer: {} - } -}; +module.exports = { + plugins: { + tailwindcss: {}, + autoprefixer: {} + } +}; diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx index d90e7817bd53adb715c8331207961cdd0bd137ee..d4becad808a1fd73f401c90c9b8b57c7b63ea2ed 100644 --- a/frontend/src/App.jsx +++ b/frontend/src/App.jsx @@ -1,30 +1,106 @@ -// frontend/src/App.jsx - -import React from 
"react"; -import { Routes, Route } from "react-router-dom"; -import Layout from "./Layout"; -import Dashboard from "./pages/Dashboard"; -import History from "./pages/History"; - -export default function App() { - return ( - - - - - } - /> - - - - } - /> - - ); -} +// frontend/src/App.jsx + +import React, { useEffect } from "react"; +import { Routes, Route, useNavigate, useSearchParams } from "react-router-dom"; +import { AuthProvider, useAuth } from "./contexts/AuthContext"; +import Layout from "./Layout"; +import Dashboard from "./pages/Dashboard"; +import History from "./pages/History"; +import ShareHandler from "./pages/ShareHandler"; +import LoginForm from "./components/auth/LoginForm"; + +// Auth callback handler component +function AuthCallback() { + const [searchParams] = useSearchParams(); + const { handleAuthCallback } = useAuth(); + const navigate = useNavigate(); + + useEffect(() => { + const token = searchParams.get("token"); + if (token) { + handleAuthCallback(token); + navigate("/"); + } else { + navigate("/"); + } + }, [searchParams, handleAuthCallback, navigate]); + + return ( +
+
+

Completing authentication...

+
+
+ ); +} + +// Protected route wrapper +function ProtectedRoute({ children }) { + const { isAuthenticated, loading } = useAuth(); + + if (loading) { + return ( +
+
+
+
+
+

Loading...

+
+
+ ); + } + + if (!isAuthenticated) { + return ; + } + + return children; +} + +function AppRoutes() { + return ( + + } + /> + + + + } + /> + + + + + + } + /> + + + + + + } + /> + + ); +} + +export default function App() { + return ( + + + + ); +} diff --git a/frontend/src/Layout.jsx b/frontend/src/Layout.jsx index ce60455ef9da2fcf14b37ea1a9e5c95c670bbb61..e3bcede85a09ff7559df5742b67d181f5b95bab8 100644 --- a/frontend/src/Layout.jsx +++ b/frontend/src/Layout.jsx @@ -1,143 +1,179 @@ -// frontend/src/Layout.jsx - -import React, { useState } from "react"; -import { Link } from "react-router-dom"; -import { createPageUrl } from "./utils"; -import { - LayoutDashboard, - History as HistoryIcon, - ChevronLeft, - Sparkles, -} from "lucide-react"; -import { cn } from "@/lib/utils"; - -// Import logo - Vite will process this and handle the path correctly -// For production, the logo should be in frontend/public/logo.png -// Vite will copy it to dist/logo.png during build -const logoPath = "/logo.png"; - -export default function Layout({ children, currentPageName }) { - const [collapsed, setCollapsed] = useState(false); - - const navItems = [ - { name: "Dashboard", icon: LayoutDashboard, page: "Dashboard" }, - { name: "History", icon: HistoryIcon, page: "History" }, - ]; - - return ( -
- {/* Sidebar */} - - - {/* Main Content */} -
- {children} -
-
- ); -} +// frontend/src/Layout.jsx + +import React, { useState } from "react"; +import { Link } from "react-router-dom"; +import { createPageUrl } from "./utils"; +import { + LayoutDashboard, + History as HistoryIcon, + ChevronLeft, + Sparkles, + LogOut, + User, +} from "lucide-react"; +import { cn } from "@/lib/utils"; +import { useAuth } from "./contexts/AuthContext"; + +// Import logo - Vite will process this and handle the path correctly +// For production, the logo should be in frontend/public/logo.png +// Vite will copy it to dist/logo.png during build +const logoPath = "/logo.png"; + +export default function Layout({ children, currentPageName }) { + const [collapsed, setCollapsed] = useState(false); + const { user, logout } = useAuth(); + + const navItems = [ + { name: "Dashboard", icon: LayoutDashboard, page: "Dashboard" }, + { name: "History", icon: HistoryIcon, page: "History" }, + ]; + + return ( +
+ {/* Sidebar */} + + + {/* Main Content */} +
+ {children} +
+
+ ); +} diff --git a/frontend/src/components/ErrorBoundary.jsx b/frontend/src/components/ErrorBoundary.jsx index 2d0c746664aea1d08704e25f3998bd365ae49b55..2715a4be35bd023c2eb4b128acc72b66f2f2c468 100644 --- a/frontend/src/components/ErrorBoundary.jsx +++ b/frontend/src/components/ErrorBoundary.jsx @@ -1,72 +1,72 @@ -import React from "react"; - -class ErrorBoundary extends React.Component { - constructor(props) { - super(props); - this.state = { hasError: false, error: null }; - } - - static getDerivedStateFromError(error) { - return { hasError: true, error }; - } - - componentDidCatch(error, errorInfo) { - console.error("Error caught by boundary:", error, errorInfo); - } - - render() { - if (this.state.hasError) { - return ( -
-
-
-
- - - -
-

- Something went wrong -

-

- The application encountered an error. Please refresh the page or contact support if the problem persists. -

- - {process.env.NODE_ENV === "development" && this.state.error && ( -
- - Error Details (Development Only) - -
-                    {this.state.error.toString()}
-                    {this.state.error.stack}
-                  
-
- )} -
-
-
- ); - } - - return this.props.children; - } -} - -export default ErrorBoundary; - +import React from "react"; + +class ErrorBoundary extends React.Component { + constructor(props) { + super(props); + this.state = { hasError: false, error: null }; + } + + static getDerivedStateFromError(error) { + return { hasError: true, error }; + } + + componentDidCatch(error, errorInfo) { + console.error("Error caught by boundary:", error, errorInfo); + } + + render() { + if (this.state.hasError) { + return ( +
+
+
+
+ + + +
+

+ Something went wrong +

+

+ The application encountered an error. Please refresh the page or contact support if the problem persists. +

+ + {process.env.NODE_ENV === "development" && this.state.error && ( +
+ + Error Details (Development Only) + +
+                    {this.state.error.toString()}
+                    {this.state.error.stack}
+                  
+
+ )} +
+
+
+ ); + } + + return this.props.children; + } +} + +export default ErrorBoundary; + diff --git a/frontend/src/components/ExportButtons.jsx b/frontend/src/components/ExportButtons.jsx index cad96964f454e8abd4c0f81062293c16cac45b9f..c267d7c26894c930258557d25684e6d877960d2b 100644 --- a/frontend/src/components/ExportButtons.jsx +++ b/frontend/src/components/ExportButtons.jsx @@ -1,320 +1,692 @@ -import React, { useState } from "react"; -import { motion, AnimatePresence } from "framer-motion"; -import { - Download, - Braces, - FileCode2, - Check, - Share2, - FileJson, - Copy, - Mail, - Link2, -} from "lucide-react"; -import { Button } from "@/components/ui/button"; -import { - DropdownMenu, - DropdownMenuContent, - DropdownMenuItem, - DropdownMenuSeparator, - DropdownMenuTrigger, -} from "@/components/ui/dropdown-menu"; -import { cn } from "@/lib/utils"; - -// Helper functions from ExtractionOutput -function prepareFieldsForOutput(fields, format = "json") { - if (!fields || typeof fields !== "object") { - return fields; - } - - const output = { ...fields }; - - // Remove full_text from top-level if pages array exists (to avoid duplication) - if (output.pages && Array.isArray(output.pages) && output.pages.length > 0) { - delete output.full_text; - - // Clean up each page: remove full_text from page.fields (it duplicates page.text) - output.pages = output.pages.map(page => { - const cleanedPage = { ...page }; - if (cleanedPage.fields && typeof cleanedPage.fields === "object") { - const cleanedFields = { ...cleanedPage.fields }; - // Remove full_text from page fields (duplicates page.text) - delete cleanedFields.full_text; - cleanedPage.fields = cleanedFields; - } - return cleanedPage; - }); - } - - // For JSON and XML: restructure pages into separate top-level fields (page_1, page_2, etc.) 
- if ((format === "json" || format === "xml") && output.pages && Array.isArray(output.pages)) { - // Get top-level field keys (these are merged from all pages - avoid duplicating in page fields) - const topLevelKeys = new Set(Object.keys(output).filter(k => k !== "pages" && k !== "full_text")); - - output.pages.forEach((page, idx) => { - const pageNum = page.page_number || idx + 1; - const pageFields = page.fields || {}; - - // Remove duplicate fields from page.fields: - // 1. Remove full_text (duplicates page.text) - // 2. Remove fields that match top-level fields (already shown at root) - const cleanedPageFields = {}; - for (const [key, value] of Object.entries(pageFields)) { - // Skip full_text and fields that match top-level exactly - if (key !== "full_text" && (!topLevelKeys.has(key) || (value !== output[key]))) { - cleanedPageFields[key] = value; - } - } - - const pageObj = { - text: page.text || "", - confidence: page.confidence || 0, - doc_type: page.doc_type || "other" - }; - - // Only add fields if there are unique page-specific fields - if (Object.keys(cleanedPageFields).length > 0) { - pageObj.fields = cleanedPageFields; - } - - output[`page_${pageNum}`] = pageObj; - }); - // Remove pages array - we now have page_1, page_2, etc. 
as separate fields - delete output.pages; - } - - return output; -} - -function escapeXML(str) { - return str - .replace(/&/g, "&") - .replace(//g, ">") - .replace(/"/g, """) - .replace(/'/g, "'"); -} - -function objectToXML(obj, rootName = "extraction") { - // Prepare fields - remove full_text if pages exist - const preparedObj = prepareFieldsForOutput(obj, "xml"); - - let xml = `\n<${rootName}>\n`; - - const convert = (obj, indent = " ") => { - for (const [key, value] of Object.entries(obj)) { - if (value === null || value === undefined) continue; - - // Skip full_text if pages exist (already handled in prepareFieldsForOutput) - if (key === "full_text" && obj.pages && Array.isArray(obj.pages) && obj.pages.length > 0) { - continue; - } - - if (Array.isArray(value)) { - value.forEach((item) => { - xml += `${indent}<${key}>\n`; - if (typeof item === "object") { - convert(item, indent + " "); - } else { - xml += `${indent} ${escapeXML(String(item))}\n`; - } - xml += `${indent}\n`; - }); - } else if (typeof value === "object") { - xml += `${indent}<${key}>\n`; - convert(value, indent + " "); - xml += `${indent}\n`; - } else { - xml += `${indent}<${key}>${escapeXML(String(value))}\n`; - } - } - }; - - convert(preparedObj); - xml += ``; - return xml; -} - -export default function ExportButtons({ isComplete, extractionResult }) { - const [downloading, setDownloading] = useState(null); - const [copied, setCopied] = useState(false); - - const handleDownload = (format) => { - if (!extractionResult || !extractionResult.fields) { - console.error("No extraction data available"); - return; - } - - setDownloading(format); - - try { - const fields = extractionResult.fields; - let content = ""; - let filename = ""; - let mimeType = ""; - - if (format === "json") { - const preparedFields = prepareFieldsForOutput(fields, "json"); - content = JSON.stringify(preparedFields, null, 2); - filename = `extraction_${new Date().toISOString().split('T')[0]}.json`; - mimeType = 
"application/json"; - } else if (format === "xml") { - content = objectToXML(fields); - filename = `extraction_${new Date().toISOString().split('T')[0]}.xml`; - mimeType = "application/xml"; - } - - // Create blob and download - const blob = new Blob([content], { type: mimeType }); - const url = URL.createObjectURL(blob); - const link = document.createElement("a"); - link.href = url; - link.download = filename; - document.body.appendChild(link); - link.click(); - document.body.removeChild(link); - URL.revokeObjectURL(url); - - setDownloading(null); - } catch (error) { - console.error("Download error:", error); - setDownloading(null); - } - }; - - const handleCopyLink = () => { - setCopied(true); - setTimeout(() => setCopied(false), 2000); - }; - - if (!isComplete) return null; - - return ( - - {/* JSON Download */} - - - {/* XML Download */} - - - {/* More Options Dropdown */} - - - - - - - {copied ? ( - - ) : ( - - )} - {copied ? "Link copied!" : "Copy share link"} - - - - Copy to clipboard - - - - - Send via email - - - - Export to Google Sheets - - - - - ); -} +import React, { useState } from "react"; +import { motion, AnimatePresence } from "framer-motion"; +import { + Download, + Braces, + FileCode2, + Check, + Share2, + FileText, + Link2, + Mail, +} from "lucide-react"; +import { Button } from "@/components/ui/button"; +import { + DropdownMenu, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuSeparator, + DropdownMenuTrigger, +} from "@/components/ui/dropdown-menu"; +import { cn } from "@/lib/utils"; +import ShareModal from "@/components/ShareModal"; +import ShareLinkModal from "@/components/ShareLinkModal"; +import { shareExtraction, createShareLink } from "@/services/api"; + +// Helper functions from ExtractionOutput +function prepareFieldsForOutput(fields, format = "json") { + if (!fields || typeof fields !== "object") { + return fields; + } + + const output = { ...fields }; + + // Extract Fields from root level if it exists + const rootFields = 
output.Fields; + // Remove Fields from output temporarily (will be added back at top) + delete output.Fields; + + // Remove full_text from top-level if pages array exists (to avoid duplication) + if (output.pages && Array.isArray(output.pages) && output.pages.length > 0) { + delete output.full_text; + + // Clean up each page: remove full_text from page.fields (it duplicates page.text) + output.pages = output.pages.map(page => { + const cleanedPage = { ...page }; + if (cleanedPage.fields && typeof cleanedPage.fields === "object") { + const cleanedFields = { ...cleanedPage.fields }; + // Remove full_text from page fields (duplicates page.text) + delete cleanedFields.full_text; + cleanedPage.fields = cleanedFields; + } + return cleanedPage; + }); + } + + // For JSON and XML: restructure pages into separate top-level fields (page_1, page_2, etc.) + if ((format === "json" || format === "xml") && output.pages && Array.isArray(output.pages)) { + // Get top-level field keys (these are merged from all pages - avoid duplicating in page fields) + const topLevelKeys = new Set(Object.keys(output).filter(k => k !== "pages" && k !== "full_text" && k !== "Fields")); + + output.pages.forEach((page, idx) => { + const pageNum = page.page_number || idx + 1; + const pageFields = page.fields || {}; + + // Remove duplicate fields from page.fields: + // 1. Remove full_text (duplicates page.text) + // 2. 
Remove fields that match top-level fields (already shown at root) + const cleanedPageFields = {}; + for (const [key, value] of Object.entries(pageFields)) { + // Skip full_text and fields that match top-level exactly + if (key !== "full_text" && (!topLevelKeys.has(key) || (value !== output[key]))) { + cleanedPageFields[key] = value; + } + } + + const pageObj = { + text: page.text || "", + confidence: page.confidence || 0, + doc_type: page.doc_type || "other" + }; + + // Add table and footer_notes if they exist + if (page.table && Array.isArray(page.table) && page.table.length > 0) { + pageObj.table = page.table; + } + if (page.footer_notes && Array.isArray(page.footer_notes) && page.footer_notes.length > 0) { + pageObj.footer_notes = page.footer_notes; + } + + // Only add fields if there are unique page-specific fields + if (Object.keys(cleanedPageFields).length > 0) { + pageObj.fields = cleanedPageFields; + } + + output[`page_${pageNum}`] = pageObj; + }); + // Remove pages array - we now have page_1, page_2, etc. 
as separate fields + delete output.pages; + } + + // Handle page_X structure (from backend) - remove Fields from page objects if they exist + if (output && typeof output === "object") { + const pageKeys = Object.keys(output).filter(k => k.startsWith("page_")); + for (const pageKey of pageKeys) { + const pageData = output[pageKey]; + if (pageData && typeof pageData === "object") { + // Remove Fields from page objects (it's now at root level) + delete pageData.Fields; + delete pageData.metadata; + } + } + } + + // Rebuild output with Fields at the top (only if it exists and is not empty) + const finalOutput = {}; + if (rootFields && typeof rootFields === "object" && Object.keys(rootFields).length > 0) { + finalOutput.Fields = rootFields; + } + + // Add all other keys + Object.keys(output).forEach(key => { + finalOutput[key] = output[key]; + }); + + return finalOutput; +} + +function escapeXML(str) { + return str + .replace(/&/g, "&") + .replace(//g, ">") + .replace(/"/g, """) + .replace(/'/g, "'"); +} + +function objectToXML(obj, rootName = "extraction") { + // Prepare fields - remove full_text if pages exist + const preparedObj = prepareFieldsForOutput(obj, "xml"); + + let xml = `\n<${rootName}>\n`; + + const convert = (obj, indent = " ") => { + for (const [key, value] of Object.entries(obj)) { + if (value === null || value === undefined) continue; + + // Skip full_text if pages exist (already handled in prepareFieldsForOutput) + if (key === "full_text" && obj.pages && Array.isArray(obj.pages) && obj.pages.length > 0) { + continue; + } + + if (Array.isArray(value)) { + value.forEach((item) => { + xml += `${indent}<${key}>\n`; + if (typeof item === "object") { + convert(item, indent + " "); + } else { + xml += `${indent} ${escapeXML(String(item))}\n`; + } + xml += `${indent}\n`; + }); + } else if (typeof value === "object") { + xml += `${indent}<${key}>\n`; + convert(value, indent + " "); + xml += `${indent}\n`; + } else { + xml += 
`${indent}<${key}>${escapeXML(String(value))}\n`; + } + } + }; + + convert(preparedObj); + xml += ``; + return xml; +} + +export default function ExportButtons({ isComplete, extractionResult }) { + const [downloading, setDownloading] = useState(null); + const [copied, setCopied] = useState(false); + const [isShareModalOpen, setIsShareModalOpen] = useState(false); + const [isShareLinkModalOpen, setIsShareLinkModalOpen] = useState(false); + const [shareLink, setShareLink] = useState(""); + const [isGeneratingLink, setIsGeneratingLink] = useState(false); + + // Helper function to extract text from fields (same as in ExtractionOutput) + const extractTextFromFields = (fields) => { + if (!fields || typeof fields !== "object") { + return ""; + } + + // Check for page_X structure first (preferred format) + const pageKeys = Object.keys(fields).filter(key => key.startsWith("page_")); + if (pageKeys.length > 0) { + // Get text from first page (or combine all pages) + const pageTexts = pageKeys.map(key => { + const page = fields[key]; + if (page && page.text) { + return page.text; + } + return ""; + }).filter(text => text); + + if (pageTexts.length > 0) { + return pageTexts.join("\n\n"); + } + } + + // Fallback to full_text + if (fields.full_text) { + return fields.full_text; + } + + return ""; + }; + + // Helper function to escape HTML + const escapeHtml = (text) => { + if (!text) return ''; + const div = document.createElement('div'); + div.textContent = text; + return div.innerHTML; + }; + + // Helper function to convert pipe-separated tables to HTML tables + const convertPipeTablesToHTML = (text) => { + if (!text) return text; + + const lines = text.split('\n'); + const result = []; + let i = 0; + + while (i < lines.length) { + const line = lines[i]; + + // Check if this line looks like a table row (has multiple pipes) + if (line.includes('|') && line.split('|').length >= 3) { + // Check if it's a separator line (only |, -, :, spaces) + const isSeparator = 
/^[\s|\-:]+$/.test(line.trim()); + + if (!isSeparator) { + // Start of a table - collect all table rows + const tableRows = []; + let j = i; + + // Collect header row + const headerLine = lines[j]; + const headerCells = headerLine.split('|').map(cell => cell.trim()).filter(cell => cell || cell === ''); + // Remove empty cells at start/end + if (headerCells.length > 0 && !headerCells[0]) headerCells.shift(); + if (headerCells.length > 0 && !headerCells[headerCells.length - 1]) headerCells.pop(); + + if (headerCells.length >= 2) { + tableRows.push(headerCells); + j++; + + // Skip separator line if present + if (j < lines.length && /^[\s|\-:]+$/.test(lines[j].trim())) { + j++; + } + + // Collect data rows + while (j < lines.length) { + const rowLine = lines[j]; + if (!rowLine.trim()) break; // Empty line ends table + + // Check if it's still a table row + if (rowLine.includes('|') && rowLine.split('|').length >= 2) { + const isRowSeparator = /^[\s|\-:]+$/.test(rowLine.trim()); + if (!isRowSeparator) { + const rowCells = rowLine.split('|').map(cell => cell.trim()); + // Remove empty cells at start/end + if (rowCells.length > 0 && !rowCells[0]) rowCells.shift(); + if (rowCells.length > 0 && !rowCells[rowCells.length - 1]) rowCells.pop(); + tableRows.push(rowCells); + j++; + } else { + j++; + } + } else { + break; // Not a table row anymore + } + } + + // Convert to HTML table + if (tableRows.length > 0) { + let htmlTable = '\n\n'; + + // Header row + tableRows[0].forEach(cell => { + htmlTable += ``; + }); + htmlTable += '\n\n\n'; + + // Data rows + for (let rowIdx = 1; rowIdx < tableRows.length; rowIdx++) { + htmlTable += ''; + tableRows[rowIdx].forEach((cell, colIdx) => { + // Use header cell count to ensure alignment + const cellContent = cell || ''; + htmlTable += ``; + }); + htmlTable += '\n'; + } + + htmlTable += '\n
${escapeHtml(cell)}
${escapeHtml(cellContent)}
'; + result.push(htmlTable); + i = j; + continue; + } + } + } + } + + // Not a table row, add as-is + result.push(line); + i++; + } + + return result.join('\n'); + }; + + // Helper function to render markdown to HTML (same as in ExtractionOutput) + const renderMarkdownToHTML = (text) => { + if (!text) return ""; + + let html = text; + + // FIRST: Convert pipe-separated tables to HTML tables + html = convertPipeTablesToHTML(html); + + // Convert LaTeX-style superscripts/subscripts FIRST + html = html.replace(/\$\s*\^\s*\{([^}]+)\}\s*\$/g, '$1'); + html = html.replace(/\$\s*\^\s*([^\s$<>]+)\s*\$/g, '$1'); + html = html.replace(/\$\s*_\s*\{([^}]+)\}\s*\$/g, '$1'); + html = html.replace(/\$\s*_\s*([^\s$<>]+)\s*\$/g, '$1'); + + // Protect HTML table blocks + const htmlBlocks = []; + let htmlBlockIndex = 0; + + html = html.replace(//gi, (match) => { + const placeholder = `__HTML_BLOCK_${htmlBlockIndex}__`; + htmlBlocks[htmlBlockIndex] = match; + htmlBlockIndex++; + return placeholder; + }); + + // Convert markdown headers + html = html.replace(/^### (.*$)/gim, '

$1

'); + html = html.replace(/^## (.*$)/gim, '

$1

'); + html = html.replace(/^# (.*$)/gim, '

$1

'); + + // Convert markdown bold/italic + html = html.replace(/\*\*(.*?)\*\*/g, '$1'); + html = html.replace(/\*(.*?)\*/g, '$1'); + + // Convert markdown links + html = html.replace(/\[([^\]]+)\]\(([^)]+)\)/g, '$1'); + + // Process line breaks + const parts = html.split(/(__HTML_BLOCK_\d+__)/); + const processedParts = parts.map((part) => { + if (part.match(/^__HTML_BLOCK_\d+__$/)) { + const blockIndex = parseInt(part.match(/\d+/)[0]); + return htmlBlocks[blockIndex]; + } else { + let processed = part; + processed = processed.replace(/\n\n+/g, '

'); + processed = processed.replace(/([^\n>])\n([^\n<])/g, '$1
$2'); + if (processed.trim() && !processed.trim().startsWith('<')) { + processed = '

' + processed + '

'; + } + return processed; + } + }); + + html = processedParts.join(''); + html = html.replace(/

<\/p>/g, ''); + html = html.replace(/

\s*
\s*<\/p>/g, ''); + html = html.replace(/

\s*<\/p>/g, ''); + + return html; + }; + + const handleDownload = async (format) => { + if (!extractionResult || !extractionResult.fields) { + console.error("No extraction data available"); + return; + } + + setDownloading(format); + + try { + const fields = extractionResult.fields; + let content = ""; + let filename = ""; + let mimeType = ""; + + if (format === "json") { + const preparedFields = prepareFieldsForOutput(fields, "json"); + content = JSON.stringify(preparedFields, null, 2); + filename = `extraction_${new Date().toISOString().split('T')[0]}.json`; + mimeType = "application/json"; + } else if (format === "xml") { + content = objectToXML(fields); + filename = `extraction_${new Date().toISOString().split('T')[0]}.xml`; + mimeType = "application/xml"; + } else if (format === "docx") { + // For DOCX, create a Word-compatible HTML document that preserves layout + // Extract text and convert to HTML (same as text viewer) + const textContent = extractTextFromFields(fields); + const htmlContent = renderMarkdownToHTML(textContent); + + // Create a Word-compatible HTML document with proper MIME type + // Word can open HTML files with .docx extension if we use the right MIME type + const wordHTML = ` + + + + + + + + Document Extraction + + + +${htmlContent} + +`; + + content = wordHTML; + filename = `extraction_${new Date().toISOString().split('T')[0]}.doc`; + mimeType = "application/msword"; + } + + // Create blob and download + const blob = new Blob([content], { type: mimeType }); + const url = URL.createObjectURL(blob); + const link = document.createElement("a"); + link.href = url; + link.download = filename; + document.body.appendChild(link); + link.click(); + document.body.removeChild(link); + URL.revokeObjectURL(url); + + setDownloading(null); + } catch (error) { + console.error("Download error:", error); + setDownloading(null); + } + }; + + const handleCopyLink = async () => { + if (!extractionResult?.id) return; + + setIsGeneratingLink(true); + 
setIsShareLinkModalOpen(true); + setShareLink(""); + + try { + const result = await createShareLink(extractionResult.id); + if (result.success && result.share_link) { + setShareLink(result.share_link); + } else { + throw new Error("Failed to generate share link"); + } + } catch (err) { + console.error("Failed to create share link:", err); + setShareLink(""); + // Still show modal but with error state + } finally { + setIsGeneratingLink(false); + } + }; + + const handleShare = async (extractionId, recipientEmail) => { + await shareExtraction(extractionId, recipientEmail); + }; + + if (!isComplete) return null; + + return ( + + {/* Export Options Dropdown */} + + + + + + setIsShareModalOpen(true)} + > + + Share output + + + + Copy share link + + + handleDownload("docx")} + disabled={downloading === "docx"} + > + {downloading === "docx" ? ( + + + + ) : ( + + )} + Download Docx + + handleDownload("json")} + disabled={downloading === "json"} + > + {downloading === "json" ? ( + + + + ) : ( + + )} + Download JSON + + handleDownload("xml")} + disabled={downloading === "xml"} + > + {downloading === "xml" ? 
( + + + + ) : ( + + )} + Download XML + + + + + {/* Share Modal */} + setIsShareModalOpen(false)} + onShare={handleShare} + extractionId={extractionResult?.id} + /> + + {/* Share Link Modal */} + { + setIsShareLinkModalOpen(false); + setShareLink(""); + }} + shareLink={shareLink} + isLoading={isGeneratingLink} + /> + + ); +} diff --git a/frontend/src/components/ShareLinkModal.jsx b/frontend/src/components/ShareLinkModal.jsx new file mode 100644 index 0000000000000000000000000000000000000000..3c6dfe50920582357a4ac425286c2832cd94867d --- /dev/null +++ b/frontend/src/components/ShareLinkModal.jsx @@ -0,0 +1,141 @@ +import React, { useState, useEffect } from "react"; +import { motion, AnimatePresence } from "framer-motion"; +import { X, Copy, Check, Loader2 } from "lucide-react"; +import { Button } from "@/components/ui/button"; +import { Input } from "@/components/ui/input"; + +export default function ShareLinkModal({ isOpen, onClose, shareLink, isLoading }) { + const [copied, setCopied] = useState(false); + + useEffect(() => { + if (!isOpen) { + setCopied(false); + } + }, [isOpen]); + + const handleCopy = async () => { + if (!shareLink) return; + + try { + await navigator.clipboard.writeText(shareLink); + setCopied(true); + setTimeout(() => setCopied(false), 2000); + } catch (err) { + // Fallback for older browsers + const textArea = document.createElement("textarea"); + textArea.value = shareLink; + textArea.style.position = "fixed"; + textArea.style.opacity = "0"; + document.body.appendChild(textArea); + textArea.select(); + try { + document.execCommand("copy"); + setCopied(true); + setTimeout(() => setCopied(false), 2000); + } catch (fallbackErr) { + console.error("Failed to copy:", fallbackErr); + } + document.body.removeChild(textArea); + } + }; + + if (!isOpen) return null; + + return ( + +

+ {/* Backdrop */} + + + {/* Modal */} + e.stopPropagation()} + > + {/* Header */} +
+

Copy Share Link

+ +
+ + {/* Content */} +
+ {isLoading ? ( +
+ +

Generating share link...

+
+ ) : shareLink ? ( +
+
+ +
+ + +
+
+

+ Share this link with anyone you want to give access to this extraction. They'll need to sign in to view it. +

+
+ ) : ( +
+

No share link available

+
+ )} + +
+ +
+
+
+
+ + ); +} + diff --git a/frontend/src/components/ShareModal.jsx b/frontend/src/components/ShareModal.jsx new file mode 100644 index 0000000000000000000000000000000000000000..5fc2f42803dc05d3b5370a36a5f2a092f49f1155 --- /dev/null +++ b/frontend/src/components/ShareModal.jsx @@ -0,0 +1,197 @@ +import React, { useState } from "react"; +import { motion, AnimatePresence } from "framer-motion"; +import { X, Mail, Send, Loader2 } from "lucide-react"; +import { Button } from "@/components/ui/button"; +import { Input } from "@/components/ui/input"; + +export default function ShareModal({ isOpen, onClose, onShare, extractionId }) { + const [email, setEmail] = useState(""); + const [isLoading, setIsLoading] = useState(false); + const [error, setError] = useState(""); + const [success, setSuccess] = useState(false); + const [successMessage, setSuccessMessage] = useState(""); + + const handleSubmit = async (e) => { + e.preventDefault(); + setError(""); + setSuccess(false); + + // Parse and validate multiple emails (comma or semicolon separated) + if (!email.trim()) { + setError("Please enter at least one recipient email address"); + return; + } + + // Split by comma or semicolon, trim each email, and filter out empty strings + const emailList = email + .split(/[,;]/) + .map((e) => e.trim()) + .filter((e) => e.length > 0); + + if (emailList.length === 0) { + setError("Please enter at least one recipient email address"); + return; + } + + // Validate each email + const emailRegex = /^[^\s@]+@[^\s@]+\.[^\s@]+$/; + const invalidEmails = emailList.filter((e) => !emailRegex.test(e)); + + if (invalidEmails.length > 0) { + setError(`Invalid email address(es): ${invalidEmails.join(", ")}`); + return; + } + + setIsLoading(true); + try { + const result = await onShare(extractionId, emailList); + setSuccessMessage(result?.message || `Successfully shared with ${emailList.length} recipient(s)`); + setSuccess(true); + setEmail(""); + // Close modal after 2 seconds + setTimeout(() => { + 
setSuccess(false); + setSuccessMessage(""); + onClose(); + }, 2000); + } catch (err) { + setError(err.message || "Failed to share extraction. Please try again."); + } finally { + setIsLoading(false); + } + }; + + const handleClose = () => { + if (!isLoading) { + setEmail(""); + setError(""); + setSuccess(false); + onClose(); + } + }; + + if (!isOpen) return null; + + return ( + +
+ {/* Backdrop */} + + + {/* Modal */} + e.stopPropagation()} + > + {/* Header */} +
+

Share Output

+ +
+ + {/* Content */} +
+ {success ? ( + +
+ +
+

+ Share Sent Successfully! +

+

+ {successMessage || "The recipient(s) will receive an email with a link to view the extraction."} +

+
+ ) : ( +
+
+ +

+ Separate multiple emails with commas or semicolons +

+
+ + setEmail(e.target.value)} + placeholder="Enter email addresses (comma or semicolon separated)" + className="pl-10 h-12 rounded-xl border-slate-200 focus:border-indigo-500 focus:ring-indigo-500" + disabled={isLoading} + autoFocus + /> +
+ {error && ( + + {error} + + )} +
+ +
+ + +
+
+ )} +
+
+
+
+ ); +} + diff --git a/frontend/src/components/auth/LoginForm.jsx b/frontend/src/components/auth/LoginForm.jsx new file mode 100644 index 0000000000000000000000000000000000000000..903dd50eb5b0ec77276cd942f20b3a4e13795845 --- /dev/null +++ b/frontend/src/components/auth/LoginForm.jsx @@ -0,0 +1,512 @@ +import React, { useState } from "react"; +import { motion } from "framer-motion"; +import { Button } from "@/components/ui/button"; +import { Input } from "@/components/ui/input"; +import { Separator } from "@/components/ui/separator"; +import { + Zap, + Target, + Upload, + CheckCircle2, + ArrowRight, + Mail, + Sparkles, + Shield, + Globe, + AlertCircle, + Loader2, +} from "lucide-react"; +import { useAuth } from "@/contexts/AuthContext"; + +export default function LoginForm() { + const { firebaseLogin, requestOTP, verifyOTP } = useAuth(); + const [email, setEmail] = useState(""); + const [showOtp, setShowOtp] = useState(false); + const [otp, setOtp] = useState(["", "", "", "", "", ""]); + const [loading, setLoading] = useState(false); + const [error, setError] = useState(""); + + // Business email validation + const PERSONAL_EMAIL_DOMAINS = [ + "gmail.com", + "yahoo.com", + "hotmail.com", + "outlook.com", + "aol.com", + "icloud.com", + "mail.com", + "protonmail.com", + "yandex.com", + "zoho.com", + "gmx.com", + "live.com", + "msn.com", + ]; + + const isBusinessEmail = (email) => { + if (!email || !email.includes("@")) return false; + const domain = email.split("@")[1].toLowerCase(); + return !PERSONAL_EMAIL_DOMAINS.includes(domain); + }; + + const handleGoogleLogin = async () => { + setLoading(true); + setError(""); + try { + await firebaseLogin(); + } catch (err) { + setError(err.message || "Failed to sign in with Google"); + } finally { + setLoading(false); + } + }; + + const handleEmailSubmit = async (e) => { + e.preventDefault(); + setLoading(true); + setError(""); + + if (!email) { + setError("Please enter your email address"); + setLoading(false); + return; + } 
+ + if (!isBusinessEmail(email)) { + setError("Only business email addresses are allowed. Personal email accounts (Gmail, Yahoo, etc.) are not permitted."); + setLoading(false); + return; + } + + try { + await requestOTP(email); + setShowOtp(true); + } catch (err) { + setError(err.message || "Failed to send OTP"); + } finally { + setLoading(false); + } + }; + + const handleOtpChange = (index, value) => { + if (value.length <= 1 && /^\d*$/.test(value)) { + const newOtp = [...otp]; + newOtp[index] = value; + setOtp(newOtp); + setError(""); + + // Auto-focus next input + if (value && index < 5) { + const nextInput = document.getElementById(`otp-${index + 1}`); + nextInput?.focus(); + } + } + }; + + const handleOtpPaste = (e, startIndex = 0) => { + e.preventDefault(); + const pastedData = e.clipboardData.getData("text"); + // Extract only digits from pasted content + const digits = pastedData.replace(/\D/g, "").slice(0, 6); + + if (digits.length > 0) { + const newOtp = [...otp]; + // Fill the OTP array with pasted digits starting from the current field + for (let i = 0; i < digits.length && (startIndex + i) < 6; i++) { + newOtp[startIndex + i] = digits[i]; + } + setOtp(newOtp); + setError(""); + + // Focus on the next empty input or the last input if all are filled + const nextEmptyIndex = Math.min(startIndex + digits.length, 5); + const nextInput = document.getElementById(`otp-${nextEmptyIndex}`); + nextInput?.focus(); + } + }; + + const handleOtpKeyDown = (index, e) => { + if (e.key === "Backspace" && !otp[index] && index > 0) { + const prevInput = document.getElementById(`otp-${index - 1}`); + prevInput?.focus(); + } + }; + + const handleOtpVerify = async (e) => { + e.preventDefault(); + setLoading(true); + setError(""); + + const otpString = otp.join(""); + if (otpString.length !== 6) { + setError("Please enter a valid 6-digit OTP"); + setLoading(false); + return; + } + + try { + await verifyOTP(email, otpString); + // Success - user will be redirected by 
AuthContext + } catch (err) { + setError(err.message || "Invalid OTP. Please try again."); + setOtp(["", "", "", "", "", ""]); + } finally { + setLoading(false); + } + }; + + const features = [ + { + icon: Zap, + title: "Lightning Fast", + description: "Process documents in seconds and get outputs for ERP ingestion", + color: "text-amber-500", + bg: "bg-amber-50", + }, + { + icon: Target, + title: "100% Accuracy", + description: "Industry-leading extraction with Visual Reasoning Processor", + color: "text-emerald-500", + bg: "bg-emerald-50", + }, + { + icon: Globe, + title: "Any Format, Any Language", + description: "PDF, images, scanned docs — multi-lingual support included", + color: "text-blue-500", + bg: "bg-blue-50", + }, + ]; + + const supportedFormats = [ + { ext: "PDF", color: "bg-red-500" }, + { ext: "PNG", color: "bg-blue-500" }, + { ext: "JPG", color: "bg-green-500" }, + { ext: "TIFF", color: "bg-purple-500" }, + ]; + + return ( +
+ {/* Left Side - Product Showcase */} +
+ {/* Background Elements */} +
+
+ + {/* Logo & Brand */} + +
+
+ EZOFIS AI Logo { + // Fallback: hide image if logo not found + e.target.style.display = 'none'; + }} + /> +
+
+

EZOFISOCR

+

VRP Intelligence

+
+
+
+ + {/* Main Content */} + +
+

+ Pure Agentic + + Document Intelligence + +

+

+ Deterministic, layout-aware extraction (without LLM) using our proprietary{" "} + Visual Reasoning Processor (VRP) +

+
+ + {/* Product Preview Card */} + +
+
+ +
+

Drop a document to extract data

+

Invoices, purchase orders, delivery notes, receipts, and operational documents

+ +
+ {supportedFormats.map((format, i) => ( + + {format.ext} + + ))} +
+
+ +
+
+
+ Ready to extract +
+
+ + 99.8% Accuracy +
+
+ + + {/* Features */} +
+ {features.map((feature, index) => ( + +
+ +
+
+

{feature.title}

+

{feature.description}

+
+
+ ))} +
+ + + {/* Trust Badge */} + + + Enterprise-grade security • SOC 2 Compliant • GDPR Ready + +
+ + {/* Right Side - Sign In Form */} +
+ + {/* Mobile Logo */} +
+
+ EZOFIS AI Logo { + // Fallback: hide image if logo not found + e.target.style.display = 'none'; + }} + /> +
+
+

EZOFISOCR

+

VRP Intelligence

+
+
+ +
+
+

+ {showOtp ? "Enter verification code" : "Secure Access"} +

+

+ {showOtp ? `We sent a code to ${email}` : "Access your document intelligence workspace"} +

+
+ + {/* Error Message */} + {error && ( + + +

{error}

+
+ )} + + {!showOtp ? ( + <> + {/* Google Sign In */} + + +
+ + + or continue with email + +
+ + {/* Email Input */} +
+
+ + { + setEmail(e.target.value); + setError(""); + }} + className="h-12 pl-12 text-base border-slate-200 focus:border-blue-500 focus:ring-blue-500" + /> +
+ +
+ + ) : ( + /* OTP Input */ +
+
+ {otp.map((digit, index) => ( + handleOtpChange(index, e.target.value)} + onKeyDown={(e) => handleOtpKeyDown(index, e)} + onPaste={(e) => handleOtpPaste(e, index)} + className="w-12 h-14 text-center text-xl font-semibold border-slate-200 focus:border-blue-500 focus:ring-blue-500" + /> + ))} +
+ + + + +
+ )} + + {/* Notice */} +
+
+ + Only business email addresses are allowed +
+

+ By signing in, you agree to our{" "} + + Terms of Service + {" "} + and{" "} + + Privacy Policy + +

+
+
+ + {/* Mobile Features */} +
+ {features.map((feature) => ( +
+
+ +
+ {feature.title} +
+ ))} +
+
+
+
+ ); +} diff --git a/frontend/src/components/ocr/DocumentPreview.jsx b/frontend/src/components/ocr/DocumentPreview.jsx index 6d94ef19e1125d0124495119f42bda0b1e90821a..cfd2da23a141a7c60b73ce616c0b79820a95877a 100644 --- a/frontend/src/components/ocr/DocumentPreview.jsx +++ b/frontend/src/components/ocr/DocumentPreview.jsx @@ -1,236 +1,229 @@ -import React, { useState, useEffect, useRef } from "react"; -import { motion } from "framer-motion"; -import { FileText, ZoomIn, ZoomOut, RotateCw, Maximize2 } from "lucide-react"; -import { Button } from "@/components/ui/button"; - -export default function DocumentPreview({ file, isProcessing }) { - const [previewUrls, setPreviewUrls] = useState([]); - const [zoom, setZoom] = useState(100); - const [rotation, setRotation] = useState(0); - const objectUrlsRef = useRef([]); - - useEffect(() => { - if (!file) { - // Cleanup previous URLs - objectUrlsRef.current.forEach((url) => { - if (url && url.startsWith("blob:")) { - URL.revokeObjectURL(url); - } - }); - objectUrlsRef.current = []; - setPreviewUrls([]); - return; - } - - const loadPreview = async () => { - const urls = []; - const newObjectUrls = []; - - // Check if it's a PDF - if (file.type === "application/pdf" || file.name?.toLowerCase().endsWith(".pdf")) { - try { - // Use pdf.js to render PDF pages - const pdfjsLib = await import("pdfjs-dist"); - - // Configure worker - use jsdelivr CDN which is more reliable - // This will use the same version as the installed package - const version = pdfjsLib.version || "4.0.379"; - pdfjsLib.GlobalWorkerOptions.workerSrc = `https://cdn.jsdelivr.net/npm/pdfjs-dist@${version}/build/pdf.worker.min.mjs`; - - const arrayBuffer = await file.arrayBuffer(); - const pdf = await pdfjsLib.getDocument({ data: arrayBuffer }).promise; - const numPages = pdf.numPages; - - for (let pageNum = 1; pageNum <= numPages; pageNum++) { - const page = await pdf.getPage(pageNum); - const viewport = page.getViewport({ scale: 2.0 }); - - const canvas = 
document.createElement("canvas"); - const context = canvas.getContext("2d"); - canvas.height = viewport.height; - canvas.width = viewport.width; - - await page.render({ - canvasContext: context, - viewport: viewport, - }).promise; - - urls.push(canvas.toDataURL("image/jpeg", 0.95)); - } - } catch (error) { - console.error("Error loading PDF:", error); - // Fallback: show error message - urls.push(null); - } - } else { - // For images, create object URL - const url = URL.createObjectURL(file); - urls.push(url); - newObjectUrls.push(url); - } - - // Cleanup old object URLs - objectUrlsRef.current.forEach((url) => { - if (url && url.startsWith("blob:")) { - URL.revokeObjectURL(url); - } - }); - objectUrlsRef.current = newObjectUrls; - setPreviewUrls(urls); - }; - - loadPreview(); - - // Cleanup function - revoke object URLs when component unmounts or file changes - return () => { - objectUrlsRef.current.forEach((url) => { - if (url && url.startsWith("blob:")) { - URL.revokeObjectURL(url); - } - }); - objectUrlsRef.current = []; - }; - }, [file]); - - return ( -
- {/* Header */} -
-
-
- -
-
-

Document Preview

-

{file?.name || "No file selected"}

-
-
- - {file && ( -
- - {zoom}% - -
- - -
- )} -
- - {/* Preview Area */} -
- {!file ? ( -
-
-
- -
-

Upload a document to preview

-
-
- ) : previewUrls.length === 0 ? ( -
-
-
- -
-

Loading preview...

-
-
- ) : ( -
- {previewUrls.map((url, index) => ( - - {url ? ( - {`Page - ) : ( -
-

Unable to load preview

-
- )} - - {/* Processing overlay */} - {isProcessing && ( - - - - )} - - {/* Page number */} - {previewUrls.length > 1 && ( -
- Page {index + 1} -
- )} -
- ))} -
- )} -
-
- ); -} +import React, { useState, useEffect, useRef } from "react"; +import { motion } from "framer-motion"; +import { FileText, ZoomIn, ZoomOut, RotateCw } from "lucide-react"; +import { Button } from "@/components/ui/button"; + +export default function DocumentPreview({ file, isProcessing, isFromHistory = false }) { + const [previewUrls, setPreviewUrls] = useState([]); + const [zoom, setZoom] = useState(100); + const [rotation, setRotation] = useState(0); + const objectUrlsRef = useRef([]); + + useEffect(() => { + if (!file) { + // Cleanup previous URLs + objectUrlsRef.current.forEach((url) => { + if (url && url.startsWith("blob:")) { + URL.revokeObjectURL(url); + } + }); + objectUrlsRef.current = []; + setPreviewUrls([]); + return; + } + + const loadPreview = async () => { + const urls = []; + const newObjectUrls = []; + + // Check if it's a PDF + if (file.type === "application/pdf" || file.name?.toLowerCase().endsWith(".pdf")) { + try { + // Use pdf.js to render PDF pages + const pdfjsLib = await import("pdfjs-dist"); + + // Configure worker - use jsdelivr CDN which is more reliable + // This will use the same version as the installed package + const version = pdfjsLib.version || "4.0.379"; + pdfjsLib.GlobalWorkerOptions.workerSrc = `https://cdn.jsdelivr.net/npm/pdfjs-dist@${version}/build/pdf.worker.min.mjs`; + + const arrayBuffer = await file.arrayBuffer(); + const pdf = await pdfjsLib.getDocument({ data: arrayBuffer }).promise; + const numPages = pdf.numPages; + + for (let pageNum = 1; pageNum <= numPages; pageNum++) { + const page = await pdf.getPage(pageNum); + const viewport = page.getViewport({ scale: 2.0 }); + + const canvas = document.createElement("canvas"); + const context = canvas.getContext("2d"); + canvas.height = viewport.height; + canvas.width = viewport.width; + + await page.render({ + canvasContext: context, + viewport: viewport, + }).promise; + + urls.push(canvas.toDataURL("image/jpeg", 0.95)); + } + } catch (error) { + console.error("Error 
loading PDF:", error); + // Fallback: show error message + urls.push(null); + } + } else { + // For images, create object URL + const url = URL.createObjectURL(file); + urls.push(url); + newObjectUrls.push(url); + } + + // Cleanup old object URLs + objectUrlsRef.current.forEach((url) => { + if (url && url.startsWith("blob:")) { + URL.revokeObjectURL(url); + } + }); + objectUrlsRef.current = newObjectUrls; + setPreviewUrls(urls); + }; + + loadPreview(); + + // Cleanup function - revoke object URLs when component unmounts or file changes + return () => { + objectUrlsRef.current.forEach((url) => { + if (url && url.startsWith("blob:")) { + URL.revokeObjectURL(url); + } + }); + objectUrlsRef.current = []; + }; + }, [file]); + + return ( +
+ {/* Header */} +
+
+
+ +
+
+

Document Preview

+

{file?.name || "No file selected"}

+
+
+ + {file && ( +
+ + {zoom}% + +
+ +
+ )} +
+ + {/* Preview Area */} +
+ {!file ? ( +
+
+
+ +
+

Upload a document to preview

+
+
+ ) : previewUrls.length === 0 ? ( +
+
+
+ +
+

Loading preview...

+
+
+ ) : ( +
+ {previewUrls.map((url, index) => ( + + {url ? ( + {`Page + ) : ( +
+

+ {isFromHistory + ? "Original document not available for historical extractions" + : "Unable to load preview"} +

+
+ )} + + {/* Processing overlay */} + {isProcessing && ( + + + + )} + + {/* Page number */} + {previewUrls.length > 1 && ( +
+ Page {index + 1} +
+ )} +
+ ))} +
+ )} +
+
+ ); +} diff --git a/frontend/src/components/ocr/ExtractionOutput.jsx b/frontend/src/components/ocr/ExtractionOutput.jsx index 1cb07ee24747991bc71d6e49abfe914e496dbc8c..a33bb389c43100ed27f99c9a182bc6c7920c168b 100644 --- a/frontend/src/components/ocr/ExtractionOutput.jsx +++ b/frontend/src/components/ocr/ExtractionOutput.jsx @@ -1,639 +1,1201 @@ -import React, { useState, useEffect } from "react"; -import { motion, AnimatePresence } from "framer-motion"; -import { - Code2, - Copy, - Check, - Braces, - FileCode2, - FileText, - Sparkles, - ChevronDown, -} from "lucide-react"; -import { Button } from "@/components/ui/button"; -import { Tabs, TabsList, TabsTrigger } from "@/components/ui/tabs"; -import { cn } from "@/lib/utils"; - -// Mock extracted data -const mockData = { - document: { - type: "Invoice", - confidence: 0.98, - }, - vendor: { - name: "Acme Corporation", - address: "123 Business Ave, Suite 400", - city: "San Francisco", - state: "CA", - zip: "94102", - phone: "+1 (555) 123-4567", - }, - invoice: { - number: "INV-2024-0847", - date: "2024-01-15", - due_date: "2024-02-14", - po_number: "PO-9823", - }, - items: [ - { description: "Professional Services", quantity: 40, unit_price: 150.0, total: 6000.0 }, - { description: "Software License", quantity: 5, unit_price: 299.99, total: 1499.95 }, - { description: "Support Package", quantity: 1, unit_price: 500.0, total: 500.0 }, - ], - totals: { - subtotal: 7999.95, - tax_rate: 0.0875, - tax_amount: 699.99, - total: 8699.94, - }, -}; - -const mockXML = ` - - - - Acme Corporation -
123 Business Ave, Suite 400
- San Francisco - CA - 94102 -
- - INV-2024-0847 - 2024-01-15 - 2024-02-14 - - - - Professional Services - 40 - 6000.00 - - - - 7999.95 - 699.99 - 8699.94 - -
`; - -const mockText = `INVOICE - -ACME CORPORATION -123 Business Ave, Suite 400 -San Francisco, CA 94102 -Phone: +1 (555) 123-4567 - -Invoice Number: INV-2024-0847 -Invoice Date: January 15, 2024 -Due Date: February 14, 2024 -PO Number: PO-9823 - -BILL TO: -Customer Name -456 Client Street -New York, NY 10001 - -ITEMS: -───────────────────────────────────────────────────────── -Description Qty Unit Price Total -───────────────────────────────────────────────────────── -Professional Services 40 $150.00 $6,000.00 -Software License 5 $299.99 $1,499.95 -Support Package 1 $500.00 $500.00 -───────────────────────────────────────────────────────── - - Subtotal: $7,999.95 - Tax (8.75%): $699.99 - ───────────────────────── - TOTAL: $8,699.94 - -Payment Terms: Net 30 -Thank you for your business!`; - -// Helper function to convert object to XML -// Prepare fields for JSON/XML output - remove duplicates and restructure -function prepareFieldsForOutput(fields, format = "json") { - if (!fields || typeof fields !== "object") { - return fields; - } - - const output = { ...fields }; - - // Remove full_text from top-level if pages array exists (to avoid duplication) - if (output.pages && Array.isArray(output.pages) && output.pages.length > 0) { - delete output.full_text; - - // Clean up each page: remove full_text from page.fields (it duplicates page.text) - output.pages = output.pages.map(page => { - const cleanedPage = { ...page }; - if (cleanedPage.fields && typeof cleanedPage.fields === "object") { - const cleanedFields = { ...cleanedPage.fields }; - // Remove full_text from page fields (duplicates page.text) - delete cleanedFields.full_text; - cleanedPage.fields = cleanedFields; - } - return cleanedPage; - }); - } - - // For JSON and XML: restructure pages into separate top-level fields (page_1, page_2, etc.) 
- if ((format === "json" || format === "xml") && output.pages && Array.isArray(output.pages)) { - // Get top-level field keys (these are merged from all pages - avoid duplicating in page fields) - const topLevelKeys = new Set(Object.keys(output).filter(k => k !== "pages" && k !== "full_text")); - - output.pages.forEach((page, idx) => { - const pageNum = page.page_number || idx + 1; - const pageFields = page.fields || {}; - - // Remove duplicate fields from page.fields: - // 1. Remove full_text (duplicates page.text) - // 2. Remove fields that match top-level fields (already shown at root) - const cleanedPageFields = {}; - for (const [key, value] of Object.entries(pageFields)) { - // Skip full_text and fields that match top-level exactly - if (key !== "full_text" && (!topLevelKeys.has(key) || (value !== output[key]))) { - cleanedPageFields[key] = value; - } - } - - const pageObj = { - text: page.text || "", - confidence: page.confidence || 0, - doc_type: page.doc_type || "other" - }; - - // Only add fields if there are unique page-specific fields - if (Object.keys(cleanedPageFields).length > 0) { - pageObj.fields = cleanedPageFields; - } - - output[`page_${pageNum}`] = pageObj; - }); - // Remove pages array - we now have page_1, page_2, etc. 
as separate fields - delete output.pages; - } - - return output; -} - -function objectToXML(obj, rootName = "extraction") { - // Prepare fields - remove full_text if pages exist - const preparedObj = prepareFieldsForOutput(obj, "xml"); - - let xml = `\n<${rootName}>\n`; - - const convert = (obj, indent = " ") => { - for (const [key, value] of Object.entries(obj)) { - if (value === null || value === undefined) continue; - - // Skip full_text if pages exist (already handled in prepareFieldsForOutput) - if (key === "full_text" && obj.pages && Array.isArray(obj.pages) && obj.pages.length > 0) { - continue; - } - - if (Array.isArray(value)) { - value.forEach((item) => { - xml += `${indent}<${key}>\n`; - if (typeof item === "object") { - convert(item, indent + " "); - } else { - xml += `${indent} ${escapeXML(String(item))}\n`; - } - xml += `${indent}\n`; - }); - } else if (typeof value === "object") { - xml += `${indent}<${key}>\n`; - convert(value, indent + " "); - xml += `${indent}\n`; - } else { - xml += `${indent}<${key}>${escapeXML(String(value))}\n`; - } - } - }; - - convert(preparedObj); - xml += ``; - return xml; -} - -function escapeXML(str) { - return str - .replace(/&/g, "&") - .replace(//g, ">") - .replace(/"/g, """) - .replace(/'/g, "'"); -} - -// Helper function to format fields as readable text -function fieldsToText(fields) { - if (!fields || typeof fields !== "object") { - return "No data extracted."; - } - - // If full_text exists, show it prominently first - if (fields.full_text) { - let text = "=== FULL EXTRACTED TEXT ===\n\n"; - text += fields.full_text; - - // Don't show pages array separately if full_text already contains page markers - // (full_text from backend already includes "=== PAGE 1 ===" etc.) 
- const hasPageMarkers = fields.full_text.includes("=== PAGE") || fields.full_text.includes("--- Page"); - - // Only show pages array if full_text doesn't already have page breakdown - if (!hasPageMarkers && fields.pages && Array.isArray(fields.pages)) { - text += "\n\n=== TEXT BY PAGE ===\n\n"; - fields.pages.forEach((page, idx) => { - text += `--- Page ${page.page_number || idx + 1} ---\n`; - text += page.text || ""; - text += "\n\n"; - }); - } - - // Then show other structured fields - const otherFields = { ...fields }; - delete otherFields.full_text; - delete otherFields.pages; - - if (Object.keys(otherFields).length > 0) { - text += "\n\n=== STRUCTURED FIELDS ===\n\n"; - const formatValue = (key, value, indent = "") => { - if (Array.isArray(value)) { - text += `${indent}${key}:\n`; - value.forEach((item, idx) => { - if (typeof item === "object") { - text += `${indent} Item ${idx + 1}:\n`; - Object.entries(item).forEach(([k, v]) => formatValue(k, v, indent + " ")); - } else { - text += `${indent} - ${item}\n`; - } - }); - } else if (typeof value === "object" && value !== null) { - text += `${indent}${key}:\n`; - Object.entries(value).forEach(([k, v]) => formatValue(k, v, indent + " ")); - } else { - text += `${indent}${key}: ${value}\n`; - } - }; - - Object.entries(otherFields).forEach(([key, value]) => { - formatValue(key, value); - text += "\n"; - }); - } - - return text.trim(); - } - - // Fallback: format all fields normally - let text = ""; - const formatValue = (key, value, indent = "") => { - if (Array.isArray(value)) { - text += `${indent}${key}:\n`; - value.forEach((item, idx) => { - if (typeof item === "object") { - text += `${indent} Item ${idx + 1}:\n`; - Object.entries(item).forEach(([k, v]) => formatValue(k, v, indent + " ")); - } else { - text += `${indent} - ${item}\n`; - } - }); - } else if (typeof value === "object" && value !== null) { - text += `${indent}${key}:\n`; - Object.entries(value).forEach(([k, v]) => formatValue(k, v, indent + " ")); 
- } else { - text += `${indent}${key}: ${value}\n`; - } - }; - - Object.entries(fields).forEach(([key, value]) => { - formatValue(key, value); - text += "\n"; - }); - - return text.trim() || "No data extracted."; -} - -export default function ExtractionOutput({ hasFile, isProcessing, isComplete, extractionResult }) { - const [activeTab, setActiveTab] = useState("json"); - const [copied, setCopied] = useState(false); - - // Get fields from extraction result, default to empty object - const fields = extractionResult?.fields || {}; - const confidence = extractionResult?.confidence || 0; - const fieldsExtracted = extractionResult?.fieldsExtracted || 0; - const totalTime = extractionResult?.totalTime || 0; - - // Initialize expanded sections based on available fields - const [expandedSections, setExpandedSections] = useState(() => - Object.keys(fields).slice(0, 5) // Expand first 5 sections by default - ); - - const handleCopy = () => { - let content = ""; - if (activeTab === "json") { - const preparedFields = prepareFieldsForOutput(fields, "json"); - content = JSON.stringify(preparedFields, null, 2); - } else if (activeTab === "xml") { - content = objectToXML(fields); - } else { - content = fieldsToText(fields); - } - - navigator.clipboard.writeText(content); - setCopied(true); - setTimeout(() => setCopied(false), 2000); - }; - - // Get prepared fields for display - const preparedFields = React.useMemo(() => { - return prepareFieldsForOutput(fields, "json"); - }, [fields]); - - // Update expanded sections when fields change - React.useEffect(() => { - if (extractionResult?.fields) { - setExpandedSections(Object.keys(extractionResult.fields).slice(0, 5)); - } - }, [extractionResult]); - - const toggleSection = (section) => { - setExpandedSections((prev) => - prev.includes(section) ? 
prev.filter((s) => s !== section) : [...prev, section] - ); - }; - - const renderValue = (value) => { - if (typeof value === "number") { - return {value}; - } - if (typeof value === "string") { - return "{value}"; - } - return String(value); - }; - - const renderSection = (key, value, level = 0) => { - const isExpanded = expandedSections.includes(key); - const isObject = typeof value === "object" && value !== null; - const isArray = Array.isArray(value); - - if (!isObject) { - return ( -
- "{key}" - : - {renderValue(value)} -
- ); - } - - return ( -
- - - {isExpanded && ( - - {isArray ? ( - value.map((item, idx) => ( -
- {Object.entries(item).map(([k, v]) => renderSection(k, v, level + 2))} - {idx < value.length - 1 &&
} -
- )) - ) : ( - Object.entries(value).map(([k, v]) => renderSection(k, v, level + 1)) - )} -
- {isArray ? "]" : "}"} -
- - )} - -
- ); - }; - - return ( -
- {/* Header */} -
-
-
- -
-
-

Extracted Data

-

- {isComplete - ? `${fieldsExtracted} field${fieldsExtracted !== 1 ? 's' : ''} extracted` - : "Waiting for extraction"} -

-
-
- - {isComplete && ( -
- - - - - Text - - - - JSON - - - - XML - - - - -
- )} -
- - {/* Output Area */} -
- {!hasFile ? ( -
-
-
- -
-

Extracted data will appear here

-
-
- ) : isProcessing ? ( -
-
- - - -

Extracting data...

-

Analyzing document structure

- -
- {[0, 1, 2].map((i) => ( - - ))} -
-
-
- ) : isComplete && Object.keys(fields).length === 0 ? ( -
-
-
- -
-

No data extracted

-

The document may not contain extractable fields

-
-
- ) : ( -
- {activeTab === "text" ? ( -
-                {fieldsToText(fields)}
-              
- ) : activeTab === "json" ? ( -
- {"{"} - {Object.keys(preparedFields).length > 0 ? ( - Object.entries(preparedFields).map(([key, value]) => - renderSection(key, value, 1) - ) - ) : ( -
No fields extracted
- )} - {"}"} -
- ) : ( -
-                {objectToXML(fields).split("\n").map((line, i) => (
-                  
- {line.includes("<") ? ( - <> - {line.split(/(<\/?[\w\s=".-]+>)/g).map((part, j) => { - if (part.startsWith(" - {part} - - ); - } - if (part.startsWith("<")) { - return ( - - {part} - - ); - } - return ( - - {part} - - ); - })} - - ) : ( - line - )} -
- ))} -
- )} -
- )} -
- - {/* Confidence Footer */} - {isComplete && extractionResult && ( -
-
-
-
-
= 90 ? "bg-emerald-500" : confidence >= 70 ? "bg-amber-500" : "bg-red-500" - )} /> - Confidence: - - {confidence > 0 ? `${confidence.toFixed(1)}%` : "N/A"} - -
-
- Fields: - {fieldsExtracted} -
-
- - Processed in {totalTime >= 1000 ? `${(totalTime / 1000).toFixed(1)}s` : `${totalTime}ms`} - -
-
- )} -
- ); -} +import React, { useState, useEffect, useRef } from "react"; +import { motion, AnimatePresence } from "framer-motion"; +import { + Code2, + Copy, + Check, + Braces, + FileCode2, + FileText, + Sparkles, + ChevronDown, + Upload, +} from "lucide-react"; +import { Button } from "@/components/ui/button"; +import { Tabs, TabsList, TabsTrigger } from "@/components/ui/tabs"; +import { cn } from "@/lib/utils"; + +// Helper function to convert pipe-separated tables to HTML tables +function convertPipeTablesToHTML(text) { + if (!text) return text; + + const lines = text.split('\n'); + const result = []; + let i = 0; + + while (i < lines.length) { + const line = lines[i]; + + // Check if this line looks like a table row (has multiple pipes) + if (line.includes('|') && line.split('|').length >= 3) { + // Check if it's a separator line (only |, -, :, spaces) + const isSeparator = /^[\s|\-:]+$/.test(line.trim()); + + if (!isSeparator) { + // Start of a table - collect all table rows + const tableRows = []; + let j = i; + + // Collect header row + const headerLine = lines[j]; + const headerCells = headerLine.split('|').map(cell => cell.trim()).filter(cell => cell || cell === ''); + // Remove empty cells at start/end + if (headerCells.length > 0 && !headerCells[0]) headerCells.shift(); + if (headerCells.length > 0 && !headerCells[headerCells.length - 1]) headerCells.pop(); + + if (headerCells.length >= 2) { + tableRows.push(headerCells); + j++; + + // Skip separator line if present + if (j < lines.length && /^[\s|\-:]+$/.test(lines[j].trim())) { + j++; + } + + // Collect data rows + while (j < lines.length) { + const rowLine = lines[j]; + if (!rowLine.trim()) break; // Empty line ends table + + // Check if it's still a table row + if (rowLine.includes('|') && rowLine.split('|').length >= 2) { + const isRowSeparator = /^[\s|\-:]+$/.test(rowLine.trim()); + if (!isRowSeparator) { + const rowCells = rowLine.split('|').map(cell => cell.trim()); + // Remove empty cells at 
start/end + if (rowCells.length > 0 && !rowCells[0]) rowCells.shift(); + if (rowCells.length > 0 && !rowCells[rowCells.length - 1]) rowCells.pop(); + tableRows.push(rowCells); + j++; + } else { + j++; + } + } else { + break; // Not a table row anymore + } + } + + // Convert to HTML table + if (tableRows.length > 0) { + let htmlTable = '\n\n'; + + // Header row + tableRows[0].forEach(cell => { + htmlTable += ``; + }); + htmlTable += '\n\n\n'; + + // Data rows + for (let rowIdx = 1; rowIdx < tableRows.length; rowIdx++) { + htmlTable += ''; + tableRows[rowIdx].forEach((cell, colIdx) => { + // Use header cell count to ensure alignment + const cellContent = cell || ''; + htmlTable += ``; + }); + htmlTable += '\n'; + } + + htmlTable += '\n
${escapeHtml(cell)}
${escapeHtml(cellContent)}
'; + result.push(htmlTable); + i = j; + continue; + } + } + } + } + + // Not a table row, add as-is + result.push(line); + i++; + } + + return result.join('\n'); +} + +// Helper function to escape HTML +function escapeHtml(text) { + if (!text) return ''; + const div = document.createElement('div'); + div.textContent = text; + return div.innerHTML; +} + +// Helper function to convert markdown/HTML text to safe HTML +function renderMarkdownToHTML(text) { + if (!text) return ""; + + let html = text; + + // FIRST: Convert pipe-separated tables to HTML tables + html = convertPipeTablesToHTML(html); + + // Convert LaTeX-style superscripts/subscripts FIRST (before protecting tables) + // This ensures they're converted everywhere, including inside tables + + // Convert LaTeX-style superscripts: $^{text}$ or $^text$ to text + html = html.replace(/\$\s*\^\s*\{([^}]+)\}\s*\$/g, '$1'); + html = html.replace(/\$\s*\^\s*([^\s$<>]+)\s*\$/g, '$1'); + + // Convert LaTeX-style subscripts: $_{text}$ or $_text$ to text + html = html.replace(/\$\s*_\s*\{([^}]+)\}\s*\$/g, '$1'); + html = html.replace(/\$\s*_\s*([^\s$<>]+)\s*\$/g, '$1'); + + // Split by HTML tags to preserve existing HTML (like tables) + // Process markdown only in non-HTML sections + + // First, protect existing HTML blocks (tables, etc.) + const htmlBlocks = []; + let htmlBlockIndex = 0; + + // Extract and protect HTML table blocks + html = html.replace(//gi, (match) => { + const placeholder = `__HTML_BLOCK_${htmlBlockIndex}__`; + htmlBlocks[htmlBlockIndex] = match; + htmlBlockIndex++; + return placeholder; + }); + + // Convert markdown headers (only if not inside HTML) + html = html.replace(/^### (.*$)/gim, '

$1

'); + html = html.replace(/^## (.*$)/gim, '

$1

'); + html = html.replace(/^# (.*$)/gim, '

$1

'); + + // Convert markdown bold/italic (but not inside HTML tags) + html = html.replace(/\*\*(.*?)\*\*/g, '$1'); + html = html.replace(/\*(.*?)\*/g, '$1'); + + // Convert markdown links + html = html.replace(/\[([^\]]+)\]\(([^)]+)\)/g, '$1'); + + // Convert line breaks to paragraphs (but preserve structure around HTML blocks) + const parts = html.split(/(__HTML_BLOCK_\d+__)/); + const processedParts = parts.map((part, index) => { + if (part.match(/^__HTML_BLOCK_\d+__$/)) { + // Restore HTML block + const blockIndex = parseInt(part.match(/\d+/)[0]); + return htmlBlocks[blockIndex]; + } else { + // Process markdown in this part + let processed = part; + + // Convert double line breaks to paragraph breaks + processed = processed.replace(/\n\n+/g, '

'); + // Convert single line breaks to
(but not if already in a tag) + processed = processed.replace(/([^\n>])\n([^\n<])/g, '$1
$2'); + + // Wrap in paragraph if there's content + if (processed.trim() && !processed.trim().startsWith('<')) { + processed = '

' + processed + '

'; + } + + return processed; + } + }); + + html = processedParts.join(''); + + // Process LaTeX notation in restored HTML blocks (tables) as well + // This handles any LaTeX that might be in table cells + html = html.replace(/(]*>|]*>)([^<]*)\$\s*\^\s*\{([^}]+)\}\s*\$([^<]*)(<\/td>|<\/th>)/gi, + (match, openTag, before, supText, after, closeTag) => { + return openTag + before + '' + supText + '' + after + closeTag; + }); + html = html.replace(/(]*>|]*>)([^<]*)\$\s*\^\s*([^\s$<>]+)\s*\$([^<]*)(<\/td>|<\/th>)/gi, + (match, openTag, before, supText, after, closeTag) => { + return openTag + before + '' + supText + '' + after + closeTag; + }); + html = html.replace(/(]*>|]*>)([^<]*)\$\s*_\s*\{([^}]+)\}\s*\$([^<]*)(<\/td>|<\/th>)/gi, + (match, openTag, before, subText, after, closeTag) => { + return openTag + before + '' + subText + '' + after + closeTag; + }); + html = html.replace(/(]*>|]*>)([^<]*)\$\s*_\s*([^\s$<>]+)\s*\$([^<]*)(<\/td>|<\/th>)/gi, + (match, openTag, before, subText, after, closeTag) => { + return openTag + before + '' + subText + '' + after + closeTag; + }); + + // Clean up empty paragraphs and fix paragraph structure + html = html.replace(/

<\/p>/g, ''); + html = html.replace(/

\s*
\s*<\/p>/g, ''); + html = html.replace(/

\s*<\/p>/g, ''); + + // Ensure proper spacing around HTML blocks + html = html.replace(/(<\/table>)\s*(

$2'); + html = html.replace(/(<\/h[1-3]>)\s*($2'); + html = html.replace(/(<\/table>)\s*(

)/g, '$1$2'); + + return html; +} + +// Mock extracted data +const mockData = { + document: { + type: "Invoice", + confidence: 0.98, + }, + vendor: { + name: "Acme Corporation", + address: "123 Business Ave, Suite 400", + city: "San Francisco", + state: "CA", + zip: "94102", + phone: "+1 (555) 123-4567", + }, + invoice: { + number: "INV-2024-0847", + date: "2024-01-15", + due_date: "2024-02-14", + po_number: "PO-9823", + }, + items: [ + { description: "Professional Services", quantity: 40, unit_price: 150.0, total: 6000.0 }, + { description: "Software License", quantity: 5, unit_price: 299.99, total: 1499.95 }, + { description: "Support Package", quantity: 1, unit_price: 500.0, total: 500.0 }, + ], + totals: { + subtotal: 7999.95, + tax_rate: 0.0875, + tax_amount: 699.99, + total: 8699.94, + }, +}; + +const mockXML = ` + + + + Acme Corporation +

123 Business Ave, Suite 400
+ San Francisco + CA + 94102 + + + INV-2024-0847 + 2024-01-15 + 2024-02-14 + + + + Professional Services + 40 + 6000.00 + + + + 7999.95 + 699.99 + 8699.94 + +`; + +const mockText = `INVOICE + +ACME CORPORATION +123 Business Ave, Suite 400 +San Francisco, CA 94102 +Phone: +1 (555) 123-4567 + +Invoice Number: INV-2024-0847 +Invoice Date: January 15, 2024 +Due Date: February 14, 2024 +PO Number: PO-9823 + +BILL TO: +Customer Name +456 Client Street +New York, NY 10001 + +ITEMS: +───────────────────────────────────────────────────────── +Description Qty Unit Price Total +───────────────────────────────────────────────────────── +Professional Services 40 $150.00 $6,000.00 +Software License 5 $299.99 $1,499.95 +Support Package 1 $500.00 $500.00 +───────────────────────────────────────────────────────── + + Subtotal: $7,999.95 + Tax (8.75%): $699.99 + ───────────────────────── + TOTAL: $8,699.94 + +Payment Terms: Net 30 +Thank you for your business!`; + +// Helper function to convert object to XML +// Prepare fields for JSON/XML output - remove duplicates and restructure +function prepareFieldsForOutput(fields, format = "json") { + if (!fields || typeof fields !== "object") { + return fields; + } + + const output = { ...fields }; + + // Extract Fields from root level if it exists + const rootFields = output.Fields; + // Remove Fields from output temporarily (will be added back at top) + delete output.Fields; + + // Remove full_text from top-level if pages array exists (to avoid duplication) + if (output.pages && Array.isArray(output.pages) && output.pages.length > 0) { + delete output.full_text; + + // Clean up each page: remove full_text from page.fields (it duplicates page.text) + output.pages = output.pages.map(page => { + const cleanedPage = { ...page }; + if (cleanedPage.fields && typeof cleanedPage.fields === "object") { + const cleanedFields = { ...cleanedPage.fields }; + // Remove full_text from page fields (duplicates page.text) + delete cleanedFields.full_text; 
+ cleanedPage.fields = cleanedFields; + } + return cleanedPage; + }); + } + + // For JSON and XML: restructure pages into separate top-level fields (page_1, page_2, etc.) + if ((format === "json" || format === "xml") && output.pages && Array.isArray(output.pages)) { + // Get top-level field keys (these are merged from all pages - avoid duplicating in page fields) + const topLevelKeys = new Set(Object.keys(output).filter(k => k !== "pages" && k !== "full_text" && k !== "Fields")); + + output.pages.forEach((page, idx) => { + const pageNum = page.page_number || idx + 1; + const pageFields = page.fields || {}; + + // Remove duplicate fields from page.fields: + // 1. Remove full_text (duplicates page.text) + // 2. Remove fields that match top-level fields (already shown at root) + const cleanedPageFields = {}; + for (const [key, value] of Object.entries(pageFields)) { + // Skip full_text and fields that match top-level exactly + if (key !== "full_text" && (!topLevelKeys.has(key) || (value !== output[key]))) { + cleanedPageFields[key] = value; + } + } + + const pageObj = { + text: page.text || "", + confidence: page.confidence || 0, + doc_type: page.doc_type || "other" + }; + + // Add table and footer_notes if they exist + if (page.table && Array.isArray(page.table) && page.table.length > 0) { + pageObj.table = page.table; + } + if (page.footer_notes && Array.isArray(page.footer_notes) && page.footer_notes.length > 0) { + pageObj.footer_notes = page.footer_notes; + } + + // Only add fields if there are unique page-specific fields + if (Object.keys(cleanedPageFields).length > 0) { + pageObj.fields = cleanedPageFields; + } + + output[`page_${pageNum}`] = pageObj; + }); + // Remove pages array - we now have page_1, page_2, etc. 
as separate fields + delete output.pages; + } + + // Handle page_X structure (from backend) - remove Fields from page objects if they exist + if (output && typeof output === "object") { + const pageKeys = Object.keys(output).filter(k => k.startsWith("page_")); + for (const pageKey of pageKeys) { + const pageData = output[pageKey]; + if (pageData && typeof pageData === "object") { + // Remove Fields from page objects (it's now at root level) + delete pageData.Fields; + delete pageData.metadata; + } + } + } + + // Rebuild output with Fields at the top (only if it exists and is not empty) + const finalOutput = {}; + if (rootFields && typeof rootFields === "object" && Object.keys(rootFields).length > 0) { + finalOutput.Fields = rootFields; + } + + // Add all other keys + Object.keys(output).forEach(key => { + finalOutput[key] = output[key]; + }); + + return finalOutput; +} + +function objectToXML(obj, rootName = "extraction") { + // Prepare fields - remove full_text if pages exist + const preparedObj = prepareFieldsForOutput(obj, "xml"); + + let xml = `\n<${rootName}>\n`; + + const convert = (obj, indent = " ") => { + for (const [key, value] of Object.entries(obj)) { + if (value === null || value === undefined) continue; + + // Skip full_text if pages exist (already handled in prepareFieldsForOutput) + if (key === "full_text" && obj.pages && Array.isArray(obj.pages) && obj.pages.length > 0) { + continue; + } + + if (Array.isArray(value)) { + value.forEach((item) => { + xml += `${indent}<${key}>\n`; + if (typeof item === "object") { + convert(item, indent + " "); + } else { + xml += `${indent} ${escapeXML(String(item))}\n`; + } + xml += `${indent}\n`; + }); + } else if (typeof value === "object") { + xml += `${indent}<${key}>\n`; + convert(value, indent + " "); + xml += `${indent}\n`; + } else { + xml += `${indent}<${key}>${escapeXML(String(value))}\n`; + } + } + }; + + convert(preparedObj); + xml += ``; + return xml; +} + +function escapeXML(str) { + return str + 
.replace(/&/g, "&") + .replace(//g, ">") + .replace(/"/g, """) + .replace(/'/g, "'"); +} + +// Helper function to extract text from page structure +function extractTextFromFields(fields) { + if (!fields || typeof fields !== "object") { + return ""; + } + + // Check for page_X structure first (preferred format) + const pageKeys = Object.keys(fields).filter(key => key.startsWith("page_")); + if (pageKeys.length > 0) { + // Get text from first page (or combine all pages) + const pageTexts = pageKeys.map(key => { + const page = fields[key]; + if (page && page.text) { + return page.text; + } + return ""; + }).filter(text => text); + + if (pageTexts.length > 0) { + return pageTexts.join("\n\n"); + } + } + + // Fallback to full_text + if (fields.full_text) { + return fields.full_text; + } + + return ""; +} + +// Helper function to format fields as readable text +function fieldsToText(fields) { + if (!fields || typeof fields !== "object") { + return "No data extracted."; + } + + // Extract text from page structure or full_text + const extractedText = extractTextFromFields(fields); + + if (extractedText) { + return extractedText; + + // Don't show pages array separately if full_text already contains page markers + // (full_text from backend already includes "=== PAGE 1 ===" etc.) 
+ const hasPageMarkers = fields.full_text.includes("=== PAGE") || fields.full_text.includes("--- Page"); + + // Only show pages array if full_text doesn't already have page breakdown + if (!hasPageMarkers && fields.pages && Array.isArray(fields.pages)) { + text += "\n\n=== TEXT BY PAGE ===\n\n"; + fields.pages.forEach((page, idx) => { + text += `--- Page ${page.page_number || idx + 1} ---\n`; + text += page.text || ""; + text += "\n\n"; + }); + } + + // Then show other structured fields + const otherFields = { ...fields }; + delete otherFields.full_text; + delete otherFields.pages; + + if (Object.keys(otherFields).length > 0) { + text += "\n\n=== STRUCTURED FIELDS ===\n\n"; + const formatValue = (key, value, indent = "") => { + if (Array.isArray(value)) { + text += `${indent}${key}:\n`; + value.forEach((item, idx) => { + if (typeof item === "object") { + text += `${indent} Item ${idx + 1}:\n`; + Object.entries(item).forEach(([k, v]) => formatValue(k, v, indent + " ")); + } else { + text += `${indent} - ${item}\n`; + } + }); + } else if (typeof value === "object" && value !== null) { + text += `${indent}${key}:\n`; + Object.entries(value).forEach(([k, v]) => formatValue(k, v, indent + " ")); + } else { + text += `${indent}${key}: ${value}\n`; + } + }; + + Object.entries(otherFields).forEach(([key, value]) => { + formatValue(key, value); + text += "\n"; + }); + } + + return text.trim(); + } + + // Fallback: format all fields normally + let text = ""; + const formatValue = (key, value, indent = "") => { + if (Array.isArray(value)) { + text += `${indent}${key}:\n`; + value.forEach((item, idx) => { + if (typeof item === "object") { + text += `${indent} Item ${idx + 1}:\n`; + Object.entries(item).forEach(([k, v]) => formatValue(k, v, indent + " ")); + } else { + text += `${indent} - ${item}\n`; + } + }); + } else if (typeof value === "object" && value !== null) { + text += `${indent}${key}:\n`; + Object.entries(value).forEach(([k, v]) => formatValue(k, v, indent + " ")); 
+ } else { + text += `${indent}${key}: ${value}\n`; + } + }; + + Object.entries(fields).forEach(([key, value]) => { + formatValue(key, value); + text += "\n"; + }); + + return text.trim() || "No data extracted."; +} + +export default function ExtractionOutput({ hasFile, isProcessing, isComplete, extractionResult, onNewUpload }) { + const [activeTab, setActiveTab] = useState("json"); + const [copied, setCopied] = useState(false); + const [statusMessage, setStatusMessage] = useState("Preparing document..."); + + // Get fields from extraction result, default to empty object + const fields = extractionResult?.fields || {}; + const confidence = extractionResult?.confidence || 0; + const fieldsExtracted = extractionResult?.fieldsExtracted || 0; + const totalTime = extractionResult?.totalTime || 0; + + // Dynamic status messages that rotate during processing + const statusMessages = [ + "Preparing document...", + "Converting pages to images...", + "Visual Reasoning...", + "Reading text from document...", + "Identifying document structure...", + "Extracting tables and data...", + "Analyzing content...", + "Processing pages...", + "Organizing extracted information...", + "Finalizing results...", + ]; + + // Rotate status messages during processing + const messageIndexRef = useRef(0); + + useEffect(() => { + if (!isProcessing) { + setStatusMessage("Analyzing document structure"); + messageIndexRef.current = 0; + return; + } + + setStatusMessage(statusMessages[0]); + messageIndexRef.current = 0; + + const interval = setInterval(() => { + messageIndexRef.current = (messageIndexRef.current + 1) % statusMessages.length; + setStatusMessage(statusMessages[messageIndexRef.current]); + }, 2500); // Change message every 2.5 seconds + + return () => clearInterval(interval); + }, [isProcessing]); + + // Initialize expanded sections based on available fields + const [expandedSections, setExpandedSections] = useState(() => + Object.keys(fields).slice(0, 5) // Expand first 5 sections by 
default + ); + + // Helper function to convert HTML to formatted plain text with layout preserved + const htmlToFormattedText = (html) => { + if (!html) return ""; + + // Create a temporary div to parse HTML + const tempDiv = document.createElement("div"); + tempDiv.innerHTML = html; + + let text = ""; + + // Process each element + const processNode = (node) => { + if (node.nodeType === Node.TEXT_NODE) { + return node.textContent; + } + + if (node.nodeType !== Node.ELEMENT_NODE) { + return ""; + } + + const tagName = node.tagName?.toLowerCase(); + const children = Array.from(node.childNodes); + + switch (tagName) { + case "h1": + return "\n\n" + processChildren(children).trim() + "\n\n"; + case "h2": + return "\n\n" + processChildren(children).trim() + "\n\n"; + case "h3": + return "\n" + processChildren(children).trim() + "\n"; + case "p": + return processChildren(children) + "\n\n"; + case "br": + return "\n"; + case "strong": + case "b": + return processChildren(children); + case "em": + case "i": + return processChildren(children); + case "sup": + return processChildren(children); + case "sub": + return processChildren(children); + case "table": + return "\n" + processTable(node) + "\n\n"; + case "ul": + case "ol": + return "\n" + processList(node) + "\n\n"; + case "li": + return " • " + processChildren(children).trim() + "\n"; + default: + return processChildren(children); + } + }; + + const processChildren = (children) => { + return children.map(processNode).join(""); + }; + + const processTable = (table) => { + let tableText = ""; + const rows = table.querySelectorAll("tr"); + + if (rows.length === 0) return ""; + + // First pass: calculate column widths + const allRows = Array.from(rows); + const columnCount = Math.max(...allRows.map(row => row.querySelectorAll("td, th").length)); + const columnWidths = new Array(columnCount).fill(0); + + allRows.forEach(row => { + const cells = row.querySelectorAll("td, th"); + cells.forEach((cell, colIndex) => { + const 
cellText = processChildren(Array.from(cell.childNodes)).trim().replace(/\s+/g, " "); + columnWidths[colIndex] = Math.max(columnWidths[colIndex] || 0, cellText.length, 10); + }); + }); + + // Second pass: format rows + allRows.forEach((row, rowIndex) => { + const cells = row.querySelectorAll("td, th"); + const cellTexts = Array.from(cells).map(cell => { + let cellContent = processChildren(Array.from(cell.childNodes)).trim(); + cellContent = cellContent.replace(/\s+/g, " "); + return cellContent; + }); + + // Pad cells to column widths + const paddedCells = cellTexts.map((text, i) => { + const width = columnWidths[i] || 10; + return text.padEnd(width); + }); + + tableText += paddedCells.join(" | ") + "\n"; + + // Add separator after header row + if (rowIndex === 0 && row.querySelector("th")) { + tableText += columnWidths.map(w => "-".repeat(w)).join("-|-") + "\n"; + } + }); + + return tableText; + }; + + const processList = (list) => { + const items = list.querySelectorAll("li"); + return Array.from(items).map(item => { + return " • " + processChildren(Array.from(item.childNodes)).trim(); + }).join("\n"); + }; + + text = processChildren(Array.from(tempDiv.childNodes)); + + // Clean up extra newlines + text = text.replace(/\n{3,}/g, "\n\n"); + text = text.trim(); + + return text; + }; + + const handleCopy = () => { + let content = ""; + if (activeTab === "json") { + const preparedFields = prepareFieldsForOutput(fields, "json"); + content = JSON.stringify(preparedFields, null, 2); + } else if (activeTab === "xml") { + content = objectToXML(fields); + } else { + // For text tab, get the formatted HTML and convert to plain text with layout + const textContent = extractTextFromFields(fields); + const htmlContent = renderMarkdownToHTML(textContent); + content = htmlToFormattedText(htmlContent); + } + + navigator.clipboard.writeText(content); + setCopied(true); + setTimeout(() => setCopied(false), 2000); + }; + + // Get prepared fields for display + const preparedFields = 
React.useMemo(() => { + return prepareFieldsForOutput(fields, "json"); + }, [fields]); + + // Update expanded sections when fields change + React.useEffect(() => { + if (extractionResult?.fields) { + setExpandedSections(Object.keys(extractionResult.fields).slice(0, 5)); + } + }, [extractionResult]); + + const toggleSection = (section) => { + setExpandedSections((prev) => + prev.includes(section) ? prev.filter((s) => s !== section) : [...prev, section] + ); + }; + + const renderValue = (value) => { + if (typeof value === "number") { + return {value}; + } + if (typeof value === "string") { + return "{value}"; + } + return String(value); + }; + + const renderSection = (key, value, level = 0) => { + const isExpanded = expandedSections.includes(key); + const isObject = typeof value === "object" && value !== null; + const isArray = Array.isArray(value); + + if (!isObject) { + return ( +
+ "{key}" + : + {renderValue(value)} +
+ ); + } + + return ( +
+ + + {isExpanded && ( + + {isArray ? ( + value.map((item, idx) => ( +
+ {Object.entries(item).map(([k, v]) => renderSection(k, v, level + 2))} + {idx < value.length - 1 &&
} +
+ )) + ) : ( + Object.entries(value).map(([k, v]) => renderSection(k, v, level + 1)) + )} +
+ {isArray ? "]" : "}"} +
+ + )} + +
+ ); + }; + + return ( +
+ {/* Header */} +
+
+
+ +
+
+

Extracted Data

+

+ {isComplete + ? `${fieldsExtracted} field${fieldsExtracted !== 1 ? 's' : ''} extracted` + : "Waiting for extraction"} +

+
+ {isComplete && onNewUpload && ( + + )} +
+ + {isComplete && ( +
+ + + + + Text + + + + JSON + + + + XML + + + + +
+ )} +
+ + {/* Output Area */} +
+ {!hasFile ? ( +
+
+
+ +
+

Extracted data will appear here

+
+
+ ) : isProcessing ? ( +
+
+ + + +

Extracting data...

+

{statusMessage}

+ +
+ {[0, 1, 2].map((i) => ( + + ))} +
+
+
+ ) : isComplete && Object.keys(fields).length === 0 ? ( +
+
+
+ +
+

No data extracted

+

The document may not contain extractable fields

+
+
+ ) : ( +
+ {activeTab === "text" ? ( +
+
+ +
+ ) : activeTab === "json" ? ( +
+ {"{"} + {Object.keys(preparedFields).length > 0 ? ( + Object.entries(preparedFields).map(([key, value]) => + renderSection(key, value, 1) + ) + ) : ( +
No fields extracted
+ )} + {"}"} +
+ ) : ( +
+                {objectToXML(fields).split("\n").map((line, i) => (
+                  
+ {line.includes("<") ? ( + <> + {line.split(/(<\/?[\w\s=".-]+>)/g).map((part, j) => { + if (part.startsWith(" + {part} + + ); + } + if (part.startsWith("<")) { + return ( + + {part} + + ); + } + return ( + + {part} + + ); + })} + + ) : ( + line + )} +
+ ))} +
+ )} +
+ )} +
+ + {/* Confidence Footer */} + {isComplete && extractionResult && ( +
+
+
+
+
= 90 ? "bg-emerald-500" : confidence >= 70 ? "bg-amber-500" : "bg-red-500" + )} /> + Confidence: + + {confidence > 0 ? `${confidence.toFixed(1)}%` : "N/A"} + +
+
+ Fields: + {fieldsExtracted} +
+
+ + Processed in {totalTime >= 1000 ? `${(totalTime / 1000).toFixed(1)}s` : `${totalTime}ms`} + +
+
+ )} +
+ ); +} diff --git a/frontend/src/components/ocr/ProcessingStatus.jsx b/frontend/src/components/ocr/ProcessingStatus.jsx index acc49ab7b4dbaebde4610f0f2fb4dde909246866..e4613505cf8f7b2ae2208b3195ef84a6f8de9f99 100644 --- a/frontend/src/components/ocr/ProcessingStatus.jsx +++ b/frontend/src/components/ocr/ProcessingStatus.jsx @@ -1,111 +1,118 @@ -import React from "react"; -import { motion } from "framer-motion"; -import { - FileSearch, - Cpu, - TableProperties, - CheckCircle2, - Loader2, -} from "lucide-react"; -import { cn } from "@/lib/utils"; - -const steps = [ - { id: "upload", label: "Received", icon: FileSearch }, - { id: "analyze", label: "Analysis", icon: Cpu }, - { id: "extract", label: "Extraction", icon: TableProperties }, - { id: "complete", label: "Done", icon: CheckCircle2 }, -]; - -export default function ProcessingStatus({ isProcessing, isComplete }) { - const getCurrentStep = () => { - if (isComplete) return 4; - if (isProcessing) return 2; - return 0; - }; - - const currentStep = getCurrentStep(); - - if (!isProcessing && !isComplete) return null; - - return ( - -
- {steps.map((step, index) => { - const isActive = index + 1 === currentStep; - const isCompleted = index + 1 < currentStep || isComplete; - const Icon = step.icon; - - return ( - -
- - {(isActive && !isComplete) ? ( - - - - ) : isCompleted ? ( - - ) : ( - - )} - - -
- - {index < steps.length - 1 && ( -
- -
- )} -
- ); - })} -
-
- ); -} +import React from "react"; +import { motion } from "framer-motion"; +import { + FileSearch, + Cpu, + TableProperties, + CheckCircle2, + Loader2, +} from "lucide-react"; +import { cn } from "@/lib/utils"; + +const steps = [ + { id: "upload", label: "Received", icon: FileSearch }, + { id: "analyze", label: "Analysis", icon: Cpu }, + { id: "extract", label: "Extraction", icon: TableProperties }, + { id: "complete", label: "Done", icon: CheckCircle2 }, +]; + +export default function ProcessingStatus({ isProcessing, isComplete, currentStage }) { + const getCurrentStep = () => { + if (isComplete) return 4; // Done + if (!isProcessing) return 0; // Not started + + // Use provided currentStage or default based on isProcessing + if (currentStage === "extraction") return 3; // Extraction + if (currentStage === "analysis") return 2; // Analysis + if (currentStage === "received") return 1; // Received + + // Default: if processing, start at Analysis + return 2; // Analysis + }; + + const currentStep = getCurrentStep(); + + if (!isProcessing && !isComplete) return null; + + return ( + +
+ {steps.map((step, index) => { + const isActive = index + 1 === currentStep; + const isCompleted = index + 1 < currentStep || isComplete; + const Icon = step.icon; + + return ( + +
+ + {(isActive && !isComplete) ? ( + + + + ) : isCompleted ? ( + + ) : ( + + )} + + +
+ + {index < steps.length - 1 && ( +
+ +
+ )} +
+ ); + })} +
+
+ ); +} diff --git a/frontend/src/components/ocr/UpgradeModal.jsx b/frontend/src/components/ocr/UpgradeModal.jsx new file mode 100644 index 0000000000000000000000000000000000000000..d3a6a679b6c0993187a9dd9a896d12dd322bca1c --- /dev/null +++ b/frontend/src/components/ocr/UpgradeModal.jsx @@ -0,0 +1,213 @@ +import React from "react"; +import { motion } from "framer-motion"; +import { cn } from "@/lib/utils"; +import { + X, + Sparkles, + Zap, + Shield, + Cloud, + BarChart3, + Bot, + Globe, + Lock, + Rocket, + Users, + CheckCircle2, + ArrowRight +} from "lucide-react"; +import { Button } from "@/components/ui/button"; + +const features = [ + { + icon: Zap, + title: "Production-Scale Processing", + description: "Remove trial limits and run live AP and operations workflows", + color: "amber", + cta: "Explore with a demo", + gradient: "from-amber-500 to-orange-500" + }, + { + icon: Bot, + title: "Advanced Agentic Processing", + description: "You can customize your own agentic pipeline with your own data", + color: "indigo", + cta: "Talk to Sales", + gradient: "from-indigo-500 to-violet-500" + }, + { + icon: Cloud, + title: "API Access", + description: "Integrate EZOFIS into your workflow with our REST API", + color: "blue", + cta: "Talk to a Techie!", + gradient: "from-blue-500 to-cyan-500" + } +]; + +export default function UpgradeModal({ open, onClose }) { + if (!open) return null; + + return ( +
+ {/* Backdrop */} + + + {/* Modal */} + e.stopPropagation()} + > + {/* Header */} +
+ + + +
+ + Trial Limit Reached +
+

You've processed 2 documents

+

Continue with production-ready document intelligence

+
+
+ + {/* Stats Bar */} +
+ {[ + { label: "Accuracy Rate", value: "99.8%", icon: CheckCircle2 }, + { label: "Processing Speed", value: "< 10s", icon: Zap }, + { label: "Operational Users", value: "10,000+", icon: Users } + ].map((stat, i) => ( + +
+ + {stat.value} +
+

{stat.label}

+
+ ))} +
+ + {/* Features Grid - Scrollable */} +
+
+

+ Continue to Production Use +

+ +
+ +
+ {features.map((feature, index) => ( + + {/* Gradient Background on Hover */} +
+ +
+
+ +
+

{feature.title}

+

{feature.description}

+ + +
+ + ))} +
+
+ + {/* CTA Footer */} +
+
+
+

Ready to scale?

+

No commitment. We’ll tailor the demo to your documents and workflows.

+
+
+ + +
+
+
+ +
+ ); +} + diff --git a/frontend/src/components/ocr/UploadZone.jsx b/frontend/src/components/ocr/UploadZone.jsx index 258a8d0ee8fcf1b0d08b87a32e2ced2b8eba94d9..c438b614778e7caad91203da21a63e93357ab96f 100644 --- a/frontend/src/components/ocr/UploadZone.jsx +++ b/frontend/src/components/ocr/UploadZone.jsx @@ -1,147 +1,251 @@ -import React, { useState } from "react"; -import { motion, AnimatePresence } from "framer-motion"; -import { Upload, FileText, Image, FileSpreadsheet, X, Sparkles } from "lucide-react"; -import { cn } from "@/lib/utils"; - -export default function UploadZone({ onFileSelect, selectedFile, onClear }) { - const [isDragging, setIsDragging] = useState(false); - - const handleDragOver = (e) => { - e.preventDefault(); - setIsDragging(true); - }; - - const handleDragLeave = () => { - setIsDragging(false); - }; - - const handleDrop = (e) => { - e.preventDefault(); - setIsDragging(false); - const file = e.dataTransfer.files[0]; - if (file) onFileSelect(file); - }; - - const getFileIcon = (type) => { - if (type?.includes("image")) return Image; - if (type?.includes("spreadsheet") || type?.includes("excel")) return FileSpreadsheet; - return FileText; - }; - - const FileIcon = selectedFile ? getFileIcon(selectedFile.type) : FileText; - - return ( -
- - {!selectedFile ? ( - - - - {/* Decorative gradient border on hover */} -
- - ) : ( - -
-
- -
-
-

{selectedFile.name}

-
- {(selectedFile.size / 1024 / 1024).toFixed(2)} MB - - - - Ready for extraction - -
-
- -
-
- )} - -
- ); -} +import React, { useState, useEffect } from "react"; +import { motion, AnimatePresence } from "framer-motion"; +import { Upload, FileText, Image, FileSpreadsheet, X, Sparkles, AlertCircle } from "lucide-react"; +import { cn } from "@/lib/utils"; +import { Input } from "@/components/ui/input"; + +// Allowed file types +const ALLOWED_TYPES = [ + "application/pdf", + "image/png", + "image/jpeg", + "image/jpg", + "image/tiff", + "image/tif" +]; + +// Allowed file extensions (for fallback validation) +const ALLOWED_EXTENSIONS = [".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".tif"]; + +// Maximum file size: 4 MB +const MAX_FILE_SIZE = 4 * 1024 * 1024; // 4 MB in bytes + +export default function UploadZone({ onFileSelect, selectedFile, onClear, keyFields = "", onKeyFieldsChange = () => {} }) { + const [isDragging, setIsDragging] = useState(false); + const [error, setError] = useState(null); + + const validateFile = (file) => { + // Reset error + setError(null); + + // Check file type + const fileExtension = "." + file.name.split(".").pop().toLowerCase(); + const isValidType = ALLOWED_TYPES.includes(file.type) || ALLOWED_EXTENSIONS.includes(fileExtension); + + if (!isValidType) { + setError("Only PDF, PNG, JPG, and TIFF files are allowed."); + return false; + } + + // Check file size + if (file.size > MAX_FILE_SIZE) { + const fileSizeMB = (file.size / 1024 / 1024).toFixed(2); + setError(`File size exceeds 4 MB limit. 
Your file is ${fileSizeMB} MB.`); + return false; + } + + return true; + }; + + const handleFileSelect = (file) => { + if (validateFile(file)) { + setError(null); + onFileSelect(file); + } + }; + + const handleDragOver = (e) => { + e.preventDefault(); + setIsDragging(true); + }; + + const handleDragLeave = () => { + setIsDragging(false); + }; + + const handleDrop = (e) => { + e.preventDefault(); + setIsDragging(false); + const file = e.dataTransfer.files[0]; + if (file) { + handleFileSelect(file); + } + }; + + const getFileIcon = (type) => { + if (type?.includes("image")) return Image; + if (type?.includes("spreadsheet") || type?.includes("excel")) return FileSpreadsheet; + return FileText; + }; + + const FileIcon = selectedFile ? getFileIcon(selectedFile.type) : FileText; + + // Clear error when file is cleared + useEffect(() => { + if (!selectedFile) { + setError(null); + } + }, [selectedFile]); + + return ( +
+ + {!selectedFile ? ( + + + + {/* Decorative gradient border on hover */} +
+ + ) : ( + + {/* File Info Box */} +
+
+
+ +
+
+

{selectedFile.name}

+
+ {(selectedFile.size / 1024 / 1024).toFixed(2)} MB + + + + Ready for extraction + +
+
+ +
+
+ + {/* Key Fields Box */} +
+ + { + if (onKeyFieldsChange) { + onKeyFieldsChange(e.target.value); + } + }} + placeholder="Invoice Number, Invoice Date, PO Number, Supplier Name, Total Amount, Payment terms, Additional Notes" + className="h-8 text-xs border-slate-200 focus:border-indigo-300 focus:ring-indigo-200" + /> +
+
+ )} + + + {/* Error Message */} + {error && ( + + +

{error}

+ +
+ )} +
+ ); +} diff --git a/frontend/src/components/ui/badge.jsx b/frontend/src/components/ui/badge.jsx index a5b3422b457892c5e57ee618df58509ba540d1f8..f89fc7d2ad0cee28fbb390a47292b4b7624bf942 100644 --- a/frontend/src/components/ui/badge.jsx +++ b/frontend/src/components/ui/badge.jsx @@ -1,24 +1,24 @@ -import React from "react"; -import { cn } from "@/lib/utils"; - -const variants = { - default: - "bg-slate-900 text-white hover:bg-slate-900/90", - secondary: - "bg-slate-100 text-slate-800 border border-slate-200", - outline: - "border border-slate-200 text-slate-700", -}; - -export function Badge({ className, variant = "default", ...props }) { - return ( - - ); -} +import React from "react"; +import { cn } from "@/lib/utils"; + +const variants = { + default: + "bg-slate-900 text-white hover:bg-slate-900/90", + secondary: + "bg-slate-100 text-slate-800 border border-slate-200", + outline: + "border border-slate-200 text-slate-700", +}; + +export function Badge({ className, variant = "default", ...props }) { + return ( + + ); +} diff --git a/frontend/src/components/ui/button.jsx b/frontend/src/components/ui/button.jsx index 4c768ef35b05e72d38ca20c35639796857ab5084..e91c373aa7b0a2fc0689cc187bb4a28ef99da70a 100644 --- a/frontend/src/components/ui/button.jsx +++ b/frontend/src/components/ui/button.jsx @@ -1,38 +1,38 @@ -import React from "react"; -import { cn } from "@/lib/utils"; - -const base = - "inline-flex items-center justify-center whitespace-nowrap rounded-md text-sm font-medium transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-indigo-500 focus-visible:ring-offset-2 disabled:opacity-50 disabled:pointer-events-none"; - -const variants = { - default: "bg-indigo-600 text-white hover:bg-indigo-700 shadow-sm", - outline: - "border border-slate-200 bg-white text-slate-900 hover:bg-slate-50", - ghost: "bg-transparent text-slate-700 hover:bg-slate-100", -}; - -const sizes = { - default: "h-10 px-4 py-2", - sm: "h-8 px-3 text-xs", - lg: "h-11 
px-6 text-sm", - icon: "h-9 w-9", -}; - -export function Button({ - className, - variant = "default", - size = "default", - ...props -}) { - return ( - - ); -} - -export function DropdownMenuContent({ className, align = "end", ...props }) { - const { open } = useContext(DropdownContext); - if (!open) return null; - - const alignment = - align === "end" - ? "right-0 origin-top-right" - : align === "start" - ? "left-0 origin-top-left" - : "left-1/2 -translate-x-1/2 origin-top"; - - return ( -
- ); -} - -export function DropdownMenuItem({ className, onClick, ...props }) { - const { setOpen } = useContext(DropdownContext); - const handleClick = (e) => { - onClick?.(e); - setOpen(false); - }; - return ( -
- ); -} - -export function DropdownMenuSeparator({ className }) { - return ( -
- ); -} +import React, { + createContext, + useContext, + useState, + useRef, + useEffect, +} from "react"; +import { cn } from "@/lib/utils"; + +const DropdownContext = createContext(null); + +export function DropdownMenu({ children }) { + const [open, setOpen] = useState(false); + const triggerRef = useRef(null); + + // Close on outside click + useEffect(() => { + if (!open) return; + function handleClick(e) { + if (!triggerRef.current) return; + if (!triggerRef.current.parentElement.contains(e.target)) { + setOpen(false); + } + } + document.addEventListener("mousedown", handleClick); + return () => document.removeEventListener("mousedown", handleClick); + }, [open]); + + return ( + +
{children}
+
+ ); +} + +export function DropdownMenuTrigger({ asChild, children }) { + const { setOpen, triggerRef } = useContext(DropdownContext); + + const handleClick = (e) => { + e.stopPropagation(); + setOpen((o) => !o); + }; + + if (asChild && React.isValidElement(children)) { + return React.cloneElement(children, { + ref: triggerRef, + onClick: (e) => { + children.props.onClick?.(e); + handleClick(e); + }, + }); + } + + return ( + + ); +} + +export function DropdownMenuContent({ className, align = "end", ...props }) { + const { open } = useContext(DropdownContext); + if (!open) return null; + + const alignment = + align === "end" + ? "right-0 origin-top-right" + : align === "start" + ? "left-0 origin-top-left" + : "left-1/2 -translate-x-1/2 origin-top"; + + return ( +
+ ); +} + +export function DropdownMenuItem({ className, onClick, ...props }) { + const { setOpen } = useContext(DropdownContext); + const handleClick = (e) => { + onClick?.(e); + setOpen(false); + }; + return ( +
+ ); +} + +export function DropdownMenuSeparator({ className }) { + return ( +
+ ); +} diff --git a/frontend/src/components/ui/input.jsx b/frontend/src/components/ui/input.jsx index 9f30173f788c106889e378d11b46d4a6554818c1..319e9e14e5e2ed2c9ab73873caef3171e57b4d90 100644 --- a/frontend/src/components/ui/input.jsx +++ b/frontend/src/components/ui/input.jsx @@ -1,14 +1,14 @@ -import React from "react"; -import { cn } from "@/lib/utils"; - -export function Input({ className, ...props }) { - return ( - - ); -} +import React from "react"; +import { cn } from "@/lib/utils"; + +export function Input({ className, ...props }) { + return ( + + ); +} diff --git a/frontend/src/components/ui/select.jsx b/frontend/src/components/ui/select.jsx index 302601296df533d3810e2e8bb06874a56f8e2346..d1ddba83e564d01572d25dc96254726af905f14a 100644 --- a/frontend/src/components/ui/select.jsx +++ b/frontend/src/components/ui/select.jsx @@ -1,116 +1,116 @@ -import React, { - createContext, - useContext, - useState, - useRef, - useEffect, -} from "react"; -import { cn } from "@/lib/utils"; - -const SelectContext = createContext(null); - -export function Select({ value, onValueChange, children }) { - const [open, setOpen] = useState(false); - const [items, setItems] = useState({}); - const triggerRef = useRef(null); - - // Close on outside click - useEffect(() => { - if (!open) return; - function handleClick(e) { - if (!triggerRef.current) return; - if (!triggerRef.current.parentElement.contains(e.target)) { - setOpen(false); - } - } - document.addEventListener("mousedown", handleClick); - return () => document.removeEventListener("mousedown", handleClick); - }, [open]); - - const registerItem = (val, label) => { - setItems((prev) => ({ ...prev, [val]: label })); - }; - - return ( - -
{children}
-
- ); -} - -export function SelectTrigger({ className, children }) { - const { setOpen, triggerRef } = useContext(SelectContext); - return ( - - ); -} - -export function SelectValue({ placeholder }) { - const { value, items } = useContext(SelectContext); - const label = value ? items[value] : null; - return ( - - {label || placeholder} - - ); -} - -export function SelectContent({ className, children }) { - const { open } = useContext(SelectContext); - if (!open) return null; - return ( -
- {children} -
- ); -} - -export function SelectItem({ value, children, className }) { - const { onValueChange, setOpen, registerItem } = useContext(SelectContext); - - useEffect(() => { - registerItem(value, typeof children === "string" ? children : String(children)); - }, [value, children, registerItem]); - - const handleClick = () => { - onValueChange?.(value); - setOpen(false); - }; - - return ( -
- {children} -
- ); -} +import React, { + createContext, + useContext, + useState, + useRef, + useEffect, +} from "react"; +import { cn } from "@/lib/utils"; + +const SelectContext = createContext(null); + +export function Select({ value, onValueChange, children }) { + const [open, setOpen] = useState(false); + const [items, setItems] = useState({}); + const triggerRef = useRef(null); + + // Close on outside click + useEffect(() => { + if (!open) return; + function handleClick(e) { + if (!triggerRef.current) return; + if (!triggerRef.current.parentElement.contains(e.target)) { + setOpen(false); + } + } + document.addEventListener("mousedown", handleClick); + return () => document.removeEventListener("mousedown", handleClick); + }, [open]); + + const registerItem = (val, label) => { + setItems((prev) => ({ ...prev, [val]: label })); + }; + + return ( + +
{children}
+
+ ); +} + +export function SelectTrigger({ className, children }) { + const { setOpen, triggerRef } = useContext(SelectContext); + return ( + + ); +} + +export function SelectValue({ placeholder }) { + const { value, items } = useContext(SelectContext); + const label = value ? items[value] : null; + return ( + + {label || placeholder} + + ); +} + +export function SelectContent({ className, children }) { + const { open } = useContext(SelectContext); + if (!open) return null; + return ( +
+ {children} +
+ ); +} + +export function SelectItem({ value, children, className }) { + const { onValueChange, setOpen, registerItem } = useContext(SelectContext); + + useEffect(() => { + registerItem(value, typeof children === "string" ? children : String(children)); + }, [value, children, registerItem]); + + const handleClick = () => { + onValueChange?.(value); + setOpen(false); + }; + + return ( +
+ {children} +
+ ); +} diff --git a/frontend/src/components/ui/separator.jsx b/frontend/src/components/ui/separator.jsx new file mode 100644 index 0000000000000000000000000000000000000000..a7b216e4ca42a8f77e5c192082805484b81d6c45 --- /dev/null +++ b/frontend/src/components/ui/separator.jsx @@ -0,0 +1,16 @@ +import React from "react"; +import { cn } from "@/lib/utils"; + +export function Separator({ className, orientation = "horizontal", ...props }) { + return ( +
+ ); +} + diff --git a/frontend/src/components/ui/tabs.jsx b/frontend/src/components/ui/tabs.jsx index fa8e462074afde600ea995cdd496746a38e66c6e..23210ca7316fed05c6a6c1f1edee5be34a6ac765 100644 --- a/frontend/src/components/ui/tabs.jsx +++ b/frontend/src/components/ui/tabs.jsx @@ -1,45 +1,45 @@ -import React, { createContext, useContext } from "react"; -import { cn } from "@/lib/utils"; - -const TabsContext = createContext(null); - -export function Tabs({ value, onValueChange, children, className }) { - return ( - -
{children}
-
- ); -} - -export function TabsList({ className, ...props }) { - return ( -
- ); -} - -export function TabsTrigger({ value, className, children, ...props }) { - const ctx = useContext(TabsContext); - const selected = ctx?.value === value; - - return ( - - ); -} +import React, { createContext, useContext } from "react"; +import { cn } from "@/lib/utils"; + +const TabsContext = createContext(null); + +export function Tabs({ value, onValueChange, children, className }) { + return ( + +
{children}
+
+ ); +} + +export function TabsList({ className, ...props }) { + return ( +
+ ); +} + +export function TabsTrigger({ value, className, children, ...props }) { + const ctx = useContext(TabsContext); + const selected = ctx?.value === value; + + return ( + + ); +} diff --git a/frontend/src/config/firebase.js b/frontend/src/config/firebase.js new file mode 100644 index 0000000000000000000000000000000000000000..5531170fc724100f1892ae0127f860dee563233e --- /dev/null +++ b/frontend/src/config/firebase.js @@ -0,0 +1,30 @@ +/** + * Firebase configuration and initialization + */ +import { initializeApp } from 'firebase/app'; +import { getAuth, GoogleAuthProvider } from 'firebase/auth'; + +// Firebase configuration from environment variables +const firebaseConfig = { + apiKey: import.meta.env.VITE_FIREBASE_API_KEY, + authDomain: import.meta.env.VITE_FIREBASE_AUTH_DOMAIN, + projectId: import.meta.env.VITE_FIREBASE_PROJECT_ID, + storageBucket: import.meta.env.VITE_FIREBASE_STORAGE_BUCKET, + messagingSenderId: import.meta.env.VITE_FIREBASE_MESSAGING_SENDER_ID, + appId: import.meta.env.VITE_FIREBASE_APP_ID, +}; + +// Initialize Firebase +const app = initializeApp(firebaseConfig); + +// Initialize Firebase Authentication and get a reference to the service +export const auth = getAuth(app); + +// Configure Google Auth Provider +export const googleProvider = new GoogleAuthProvider(); +googleProvider.setCustomParameters({ + prompt: 'select_account' +}); + +export default app; + diff --git a/frontend/src/contexts/AuthContext.jsx b/frontend/src/contexts/AuthContext.jsx new file mode 100644 index 0000000000000000000000000000000000000000..d3ff473376ace5f4459eb2a2d0cc39d96a7e9111 --- /dev/null +++ b/frontend/src/contexts/AuthContext.jsx @@ -0,0 +1,115 @@ +import React, { createContext, useContext, useState, useEffect } from "react"; +import { signInWithPopup, signOut as firebaseSignOut } from "firebase/auth"; +import { auth, googleProvider } from "@/config/firebase"; +import { getCurrentUser, firebaseLogin, requestOTP, verifyOTP, logout as apiLogout } from 
"@/services/auth"; + +const AuthContext = createContext(null); + +export function AuthProvider({ children }) { + const [user, setUser] = useState(null); + const [loading, setLoading] = useState(true); + const [token, setToken] = useState(localStorage.getItem("auth_token")); + + useEffect(() => { + // Check if user is already authenticated + if (token) { + checkAuth(); + } else { + setLoading(false); + } + }, [token]); + + const checkAuth = async () => { + try { + const userData = await getCurrentUser(); + setUser(userData); + } catch (error) { + // Token is invalid, clear it + localStorage.removeItem("auth_token"); + setToken(null); + setUser(null); + } finally { + setLoading(false); + } + }; + + const handleFirebaseLogin = async () => { + try { + const result = await signInWithPopup(auth, googleProvider); + const idToken = await result.user.getIdToken(); + const response = await firebaseLogin(idToken); + handleAuthCallback(response.token); + } catch (error) { + if (error.code === 'auth/popup-closed' || error.code === 'auth/cancelled-popup-request') { + // User closed popup or cancelled - don't show error + return; + } + console.error("Firebase login error:", error); + throw new Error(error.message || "Firebase authentication failed"); + } + }; + + const handleOTPRequest = async (email) => { + try { + await requestOTP(email); + } catch (error) { + console.error("OTP request error:", error); + throw error; + } + }; + + const handleOTPVerify = async (email, otp) => { + try { + const response = await verifyOTP(email, otp); + handleAuthCallback(response.token); + } catch (error) { + console.error("OTP verify error:", error); + throw error; + } + }; + + const handleLogout = async () => { + try { + // Sign out from Firebase if user was using Firebase auth + if (auth.currentUser) { + await firebaseSignOut(auth); + } + await apiLogout(); + } catch (error) { + console.error("Logout error:", error); + } finally { + localStorage.removeItem("auth_token"); + setToken(null); + 
setUser(null); + } + }; + + const handleAuthCallback = (newToken) => { + localStorage.setItem("auth_token", newToken); + setToken(newToken); + checkAuth(); + }; + + const value = { + user, + token, + loading, + firebaseLogin: handleFirebaseLogin, + requestOTP: handleOTPRequest, + verifyOTP: handleOTPVerify, + logout: handleLogout, + handleAuthCallback, + isAuthenticated: !!user, + }; + + return {children}; +} + +export function useAuth() { + const context = useContext(AuthContext); + if (!context) { + throw new Error("useAuth must be used within an AuthProvider"); + } + return context; +} + diff --git a/frontend/src/index.css b/frontend/src/index.css index 6148f56cd77049444a4dda25d2845937903e6ff5..5c2d1fea68cc5fe6279e17861e8b90e6603553b6 100644 --- a/frontend/src/index.css +++ b/frontend/src/index.css @@ -1,7 +1,7 @@ -@tailwind base; -@tailwind components; -@tailwind utilities; - -body { - font-family: system-ui, -apple-system, BlinkMacSystemFont, "SF Pro Text", sans-serif; -} +@tailwind base; +@tailwind components; +@tailwind utilities; + +body { + font-family: system-ui, -apple-system, BlinkMacSystemFont, "SF Pro Text", sans-serif; +} diff --git a/frontend/src/lib/utils.js b/frontend/src/lib/utils.js index 66be4a48aec4109f3a6c537c94b6ec6a51ed3788..c1a897eca739dec12a65d74d8edc6c922cde8fe6 100644 --- a/frontend/src/lib/utils.js +++ b/frontend/src/lib/utils.js @@ -1,3 +1,3 @@ -export function cn(...classes) { - return classes.filter(Boolean).join(" "); -} +export function cn(...classes) { + return classes.filter(Boolean).join(" "); +} diff --git a/frontend/src/main.jsx b/frontend/src/main.jsx index 69f445b811296b4f7a21f246cedc5bb4ae6cebec..59c013f8acd12da2832c170e62141295f84bb8c9 100644 --- a/frontend/src/main.jsx +++ b/frontend/src/main.jsx @@ -1,16 +1,16 @@ -import React from "react"; -import ReactDOM from "react-dom/client"; -import { BrowserRouter } from "react-router-dom"; -import App from "./App"; -import ErrorBoundary from "./components/ErrorBoundary"; 
-import "./index.css"; - -ReactDOM.createRoot(document.getElementById("root")).render( - - - - - - - -); +import React from "react"; +import ReactDOM from "react-dom/client"; +import { BrowserRouter } from "react-router-dom"; +import App from "./App"; +import ErrorBoundary from "./components/ErrorBoundary"; +import "./index.css"; + +ReactDOM.createRoot(document.getElementById("root")).render( + + + + + + + +); diff --git a/frontend/src/pages/Dashboard.jsx b/frontend/src/pages/Dashboard.jsx index 40cfffc9ac4eae3a8e75ffa09cdef35db398680e..b4fa5dfe99c092e6ab525979efe9085f30619399 100644 --- a/frontend/src/pages/Dashboard.jsx +++ b/frontend/src/pages/Dashboard.jsx @@ -1,265 +1,476 @@ -// frontend/src/pages/Dashboard.jsx - -import React, { useState } from "react"; -import { motion } from "framer-motion"; -import { Sparkles, Zap, FileText, TrendingUp, Clock, AlertCircle } from "lucide-react"; -import { Button } from "@/components/ui/button"; -import UploadZone from "@/components/ocr/UploadZone"; -import DocumentPreview from "@/components/ocr/DocumentPreview"; -import ExtractionOutput from "@/components/ocr/ExtractionOutput"; -import ExportButtons from "@/components/ExportButtons"; -import ProcessingStatus from "@/components/ocr/ProcessingStatus"; -import { extractDocument } from "@/services/api"; - -export default function Dashboard() { - const [selectedFile, setSelectedFile] = useState(null); - const [isProcessing, setIsProcessing] = useState(false); - const [isComplete, setIsComplete] = useState(false); - const [extractionResult, setExtractionResult] = useState(null); - const [error, setError] = useState(null); - - const handleFileSelect = (file) => { - setSelectedFile(file); - setIsComplete(false); - setExtractionResult(null); - setError(null); - }; - - const handleClear = () => { - setSelectedFile(null); - setIsProcessing(false); - setIsComplete(false); - setExtractionResult(null); - setError(null); - }; - - const handleExtract = async () => { - if (!selectedFile) 
return; - - setIsProcessing(true); - setIsComplete(false); - setError(null); - setExtractionResult(null); - - try { - const result = await extractDocument(selectedFile); - setExtractionResult(result); - setIsComplete(true); - } catch (err) { - console.error("Extraction error:", err); - setError(err.message || "Failed to extract document. Please try again."); - setIsComplete(false); - } finally { - setIsProcessing(false); - } - }; - - return ( -
- {/* Header */} -
-
-
-

- Document Extraction -

-

- Upload any document and extract structured data with AI -

-
-
- {/* Stats Pills */} -
-
- - - 247 Extracted - -
-
- - - 98.5% Accuracy - -
-
- - -
-
-
- - {/* Main Content */} -
- {/* Upload Section */} - - - - {/* Extract Button */} - {selectedFile && !isProcessing && !isComplete && ( - - - - )} - - - {/* Error Message */} - {error && ( - -
- -
-

Extraction Failed

-

{error}

-
- -
-
- )} - - {/* Processing Status */} - {(isProcessing || isComplete) && ( -
- -
- )} - - {/* Split View */} - {selectedFile && ( - - - - - )} - - {/* Empty State Features */} - {!selectedFile && ( - -
-

- Powered by Advanced AI -

-

- Extract structured data from any document -

-
- -
- {[ - { - icon: Zap, - title: "Lightning Fast", - description: - "Process documents faster with our optimized AI pipeline", - color: "amber", - }, - { - icon: Sparkles, - title: "98.5% Accuracy", - description: - "Industry-leading extraction accuracy", - color: "indigo", - }, - { - icon: Clock, - title: "Any Format", - description: - "Support for PDF, images, spreadsheets, and scanned documents", - color: "emerald", - }, - ].map((feature, index) => ( - -
- -
-

- {feature.title} -

-

- {feature.description} -

-
- ))} -
- - {/* Supported Formats */} -
-

- Supported Formats -

-
- {["PDF", "PNG", "JPG", "TIFF", "DOCX", "XLSX"].map((format) => ( -
- - {format} -
- ))} -
-
-
- )} -
-
- ); -} +// frontend/src/pages/Dashboard.jsx + +import React, { useState, useEffect } from "react"; +import { useSearchParams } from "react-router-dom"; +import { motion } from "framer-motion"; +import { Sparkles, Zap, FileText, TrendingUp, Clock, AlertCircle } from "lucide-react"; +import { Button } from "@/components/ui/button"; +import UploadZone from "@/components/ocr/UploadZone"; +import DocumentPreview from "@/components/ocr/DocumentPreview"; +import ExtractionOutput from "@/components/ocr/ExtractionOutput"; +import ExportButtons from "@/components/ExportButtons"; +import ProcessingStatus from "@/components/ocr/ProcessingStatus"; +import UpgradeModal from "@/components/ocr/UpgradeModal"; +import { extractDocument, getHistory, getExtractionById } from "@/services/api"; + +export default function Dashboard() { + const [searchParams, setSearchParams] = useSearchParams(); + const [selectedFile, setSelectedFile] = useState(null); + const [keyFields, setKeyFields] = useState(""); + const [isProcessing, setIsProcessing] = useState(false); + const [isComplete, setIsComplete] = useState(false); + const [extractionResult, setExtractionResult] = useState(null); + const [error, setError] = useState(null); + const [processingStage, setProcessingStage] = useState("received"); // received, analysis, extraction, done + const [stats, setStats] = useState({ totalExtracted: 0, averageAccuracy: 0 }); + const [isLoadingFromHistory, setIsLoadingFromHistory] = useState(false); + const [showUpgradeModal, setShowUpgradeModal] = useState(false); + + const TRIAL_LIMIT = 2; // Maximum number of extractions allowed in trial + + const handleFileSelect = (file) => { + // Check if user has reached trial limit + if (stats.totalExtracted >= TRIAL_LIMIT) { + setShowUpgradeModal(true); + return; + } + setSelectedFile(file); + setIsComplete(false); + setExtractionResult(null); + setError(null); + }; + + const handleClear = () => { + setSelectedFile(null); + setKeyFields(""); + 
setIsProcessing(false); + setIsComplete(false); + setExtractionResult(null); + setError(null); + setProcessingStage("received"); + }; + + // Load extraction from history if extractionId is in URL + useEffect(() => { + const extractionId = searchParams.get("extractionId"); + console.log("Dashboard useEffect - extractionId:", extractionId, "isLoadingFromHistory:", isLoadingFromHistory, "extractionResult:", extractionResult); + + if (extractionId && !isLoadingFromHistory) { + // Only load if we don't already have this extraction loaded + const currentExtractionId = extractionResult?.id; + if (currentExtractionId && currentExtractionId === parseInt(extractionId)) { + console.log("Extraction already loaded, skipping"); + return; + } + + const loadExtractionFromHistory = async () => { + setIsLoadingFromHistory(true); + setError(null); + try { + console.log("Loading extraction from history, ID:", extractionId); + const extraction = await getExtractionById(parseInt(extractionId)); + console.log("Extraction loaded:", extraction); + console.log("Extraction fields:", extraction.fields); + console.log("Fields type:", typeof extraction.fields); + console.log("Fields keys:", extraction.fields ? 
Object.keys(extraction.fields) : "none"); + + if (!extraction) { + throw new Error("No extraction data received"); + } + + // Ensure fields is an object, not a string + let fieldsData = extraction.fields || {}; + if (typeof fieldsData === 'string') { + try { + fieldsData = JSON.parse(fieldsData); + } catch (e) { + console.error("Failed to parse fields as JSON:", e); + fieldsData = {}; + } + } + + console.log("Processed fields:", fieldsData); + + // Create file object from base64 if available, otherwise create empty file + let fileForPreview; + if (extraction.fileBase64) { + // Convert base64 to binary + const binaryString = atob(extraction.fileBase64); + const bytes = new Uint8Array(binaryString.length); + for (let i = 0; i < binaryString.length; i++) { + bytes[i] = binaryString.charCodeAt(i); + } + const fileBlob = new Blob([bytes], { type: extraction.fileType || "application/pdf" }); + fileForPreview = new File( + [fileBlob], + extraction.fileName || "document.pdf", + { type: extraction.fileType || "application/pdf" } + ); + console.log("Created file from base64:", fileForPreview.name, fileForPreview.size, "bytes"); + } else { + // Fallback: create empty file if base64 not available + const fileBlob = new Blob([], { type: extraction.fileType || "application/pdf" }); + fileForPreview = new File( + [fileBlob], + extraction.fileName || "document.pdf", + { type: extraction.fileType || "application/pdf" } + ); + console.log("No base64 available, created empty file"); + } + + // Set the extraction result - match the structure from extractDocument + const result = { + id: extraction.id, + fields: fieldsData, + confidence: extraction.confidence || 0, + fieldsExtracted: extraction.fieldsExtracted || 0, + totalTime: extraction.totalTime || 0, + fileName: extraction.fileName, + fileType: extraction.fileType, + fileSize: extraction.fileSize, + }; + + console.log("Setting extraction result:", result); + setExtractionResult(result); + setSelectedFile(fileForPreview); + 
setIsComplete(true); + setIsProcessing(false); + setProcessingStage("done"); + + // Remove the extractionId from URL + setSearchParams({}); + } catch (err) { + console.error("Failed to load extraction from history:", err); + const errorMessage = err.message || "Failed to load extraction from history"; + setError(errorMessage); + // Don't clear the URL params on error so user can see what went wrong + } finally { + setIsLoadingFromHistory(false); + } + }; + + loadExtractionFromHistory(); + } + }, [searchParams, isLoadingFromHistory, setSearchParams]); + + // Fetch and calculate stats from history + useEffect(() => { + const fetchStats = async () => { + try { + const history = await getHistory(); + + // Calculate total extracted (only completed extractions) + const completedExtractions = history.filter(item => item.status === "completed"); + const totalExtracted = completedExtractions.length; + + // Calculate average accuracy from completed extractions + const accuracies = completedExtractions + .map(item => item.confidence || 0) + .filter(acc => acc > 0); + + const averageAccuracy = accuracies.length > 0 + ? 
accuracies.reduce((sum, acc) => sum + acc, 0) / accuracies.length + : 0; + + setStats({ + totalExtracted, + averageAccuracy: Math.round(averageAccuracy * 10) / 10 // Round to 1 decimal place + }); + } catch (err) { + console.error("Failed to fetch stats:", err); + // Keep default values on error + } + }; + + // Fetch stats on mount and when extraction completes + fetchStats(); + }, [isComplete]); + + const handleExtract = async () => { + if (!selectedFile) return; + + // Check if user has reached trial limit before processing + if (stats.totalExtracted >= TRIAL_LIMIT) { + setShowUpgradeModal(true); + return; + } + + setIsProcessing(true); + setIsComplete(false); + setError(null); + setExtractionResult(null); + setProcessingStage("received"); + + // Move to Analysis stage immediately after starting + setTimeout(() => { + setProcessingStage("analysis"); + }, 100); + + // Move to Extraction stage after analysis phase (2.5 seconds) + let extractionTimer = setTimeout(() => { + setProcessingStage("extraction"); + }, 2500); + + try { + const result = await extractDocument(selectedFile, keyFields); + + // Clear the extraction timer + clearTimeout(extractionTimer); + + // Move to extraction stage if not already there, then to done + setProcessingStage("extraction"); + + // Small delay to show extraction stage, then move to done when results are rendered + setTimeout(() => { + setProcessingStage("done"); + setExtractionResult(result); + setIsComplete(true); + setIsProcessing(false); + }, 500); // Give time to see extraction stage + } catch (err) { + clearTimeout(extractionTimer); + console.error("Extraction error:", err); + setError(err.message || "Failed to extract document. Please try again."); + setIsComplete(false); + setProcessingStage("received"); + setIsProcessing(false); + } + }; + + return ( +
+ {/* Header */} +
+
+
+

+ Multi-Lingual Document Extraction +

+

+ Upload any document and extract structured data with VRP (No LLM) +

+
+
+ {/* Stats Pills */} +
+
+ + + {stats.totalExtracted}/{TRIAL_LIMIT} Used + +
+
+ + + {stats.averageAccuracy > 0 ? `${stats.averageAccuracy}%` : "0%"} Accuracy + +
+
+ + +
+
+
+ + {/* Main Content */} +
+ {/* Upload Section */} + + + + {/* Extract Button */} + {selectedFile && !isProcessing && !isComplete && ( + + + + )} + + + {/* Error Message */} + {error && ( + +
+ +
+

Extraction Failed

+

{error}

+
+ +
+
+ )} + + {/* Loading from History */} + {isLoadingFromHistory && ( + +
+ +
+

Loading extraction...

+

Retrieving extraction data from history

+
+
+
+ )} + + {/* Processing Status */} + {(isProcessing || isComplete) && !isLoadingFromHistory && ( +
+ +
+ )} + + {/* Split View */} + {selectedFile && ( + + + + + )} + + {/* Empty State Features */} + {!selectedFile && ( + +
+

+ Pure Agentic Document Intelligence +

+

+ Extract structured data from any document without LLM using VRP (Visual Resoning Processor) +

+
+ +
+ {[ + { + icon: Zap, + title: "Lightning Fast", + description: + "Process documents faster with our agentic pipeline", + color: "amber", + }, + { + icon: Sparkles, + title: `${stats.averageAccuracy > 0 ? stats.averageAccuracy : "99.8"}% Accuracy`, + description: + "Industry-leading extraction accuracy", + color: "indigo", + }, + { + icon: Clock, + title: "Any Format", + description: + "Support for PDF, images, and scanned documents", + color: "emerald", + }, + ].map((feature, index) => ( + +
+ +
+

+ {feature.title} +

+

+ {feature.description} +

+
+ ))} +
+ + {/* Supported Formats */} +
+

+ Supported Formats +

+
+ {["PDF", "PNG", "JPG", "TIFF", "JPEG"].map((format) => ( +
+ + {format} +
+ ))} +
+
+
+ )} +
+ + {/* Upgrade Modal */} + setShowUpgradeModal(false)} /> +
+ ); +} diff --git a/frontend/src/pages/History.jsx b/frontend/src/pages/History.jsx index a090645052336c56ef6cd89dc0653fb6ba210abc..4b189d4271e0bfbc62b063979ff187807201aaab 100644 --- a/frontend/src/pages/History.jsx +++ b/frontend/src/pages/History.jsx @@ -1,839 +1,859 @@ -// frontend/src/pages/History.jsx - -import React, { useState, useEffect } from "react"; -import { motion, AnimatePresence } from "framer-motion"; -import { - FileText, - Clock, - CheckCircle2, - ChevronRight, - Download, - Eye, - Trash2, - Search, - Filter, - Calendar, - Upload, - Cpu, - TableProperties, - MonitorPlay, - TrendingUp, - TrendingDown, - Minus, - AlertCircle, - X, - FileSpreadsheet, - Table2, -} from "lucide-react"; -import { Button } from "@/components/ui/button"; -import { Input } from "@/components/ui/input"; -import { Badge } from "@/components/ui/badge"; -import { - Select, - SelectContent, - SelectItem, - SelectTrigger, - SelectValue, -} from "@/components/ui/select"; -import { - DropdownMenu, - DropdownMenuContent, - DropdownMenuItem, - DropdownMenuSeparator, - DropdownMenuTrigger, -} from "@/components/ui/dropdown-menu"; -import { cn } from "@/lib/utils"; -import { getHistory } from "@/services/api"; - -// minimal "toast" -const toastSuccess = (msg) => { - console.log(msg); -}; - -const stageConfig = { - uploading: { label: "Uploading", icon: Upload, color: "blue" }, - aiAnalysis: { label: "AI Analysis", icon: Cpu, color: "violet" }, - dataExtraction: { label: "Data Extraction", icon: TableProperties, color: "emerald" }, - outputRendering: { label: "Output Rendering", icon: MonitorPlay, color: "amber" }, -}; - -const variationConfig = { - fast: { icon: TrendingDown, color: "text-emerald-500", label: "Faster than avg" }, - normal: { icon: Minus, color: "text-slate-400", label: "Normal" }, - slow: { icon: TrendingUp, color: "text-amber-500", label: "Slower than avg" }, - error: { icon: AlertCircle, color: "text-red-500", label: "Error" }, - skipped: { icon: Minus, color: 
"text-slate-300", label: "Skipped" }, -}; - -export default function History() { - const [searchQuery, setSearchQuery] = useState(""); - const [selectedStatus, setSelectedStatus] = useState("all"); - const [expandedReport, setExpandedReport] = useState(null); - const [isExporting, setIsExporting] = useState(false); - const [history, setHistory] = useState([]); - const [isLoading, setIsLoading] = useState(true); - const [error, setError] = useState(null); - - // Fetch history on component mount - useEffect(() => { - const fetchHistory = async () => { - setIsLoading(true); - setError(null); - try { - const data = await getHistory(); - setHistory(data); - } catch (err) { - console.error("Failed to fetch history:", err); - setError(err.message || "Failed to load history"); - setHistory([]); // Fallback to empty array - } finally { - setIsLoading(false); - } - }; - - fetchHistory(); - }, []); - - const filteredHistory = history.filter((item) => { - const matchesSearch = item.fileName?.toLowerCase().includes(searchQuery.toLowerCase()) ?? false; - const matchesStatus = selectedStatus === "all" || item.status === selectedStatus; - return matchesSearch && matchesStatus; - }); - - const formatTime = (ms) => { - if (ms >= 1000) { - return `${(ms / 1000).toFixed(2)}s`; - } - return `${ms}ms`; - }; - - const formatTimeForExport = (ms) => { - return ms >= 1000 ? 
`${(ms / 1000).toFixed(2)}s` : `${ms}ms`; - }; - - const formatDate = (dateString) => { - const date = new Date(dateString); - return date.toLocaleDateString("en-US", { - month: "short", - day: "numeric", - hour: "2-digit", - minute: "2-digit", - }); - }; - - const formatDateForExport = (dateString) => { - const date = new Date(dateString); - return date.toISOString().replace("T", " ").slice(0, 19); - }; - - const generateCSV = (data) => { - const headers = [ - "File Name", - "File Type", - "File Size", - "Extracted At", - "Status", - "Confidence (%)", - "Fields Extracted", - "Total Time (ms)", - "Upload Time (ms)", - "Upload Status", - "Upload Variation", - "AI Analysis Time (ms)", - "AI Analysis Status", - "AI Analysis Variation", - "Data Extraction Time (ms)", - "Data Extraction Status", - "Data Extraction Variation", - "Output Rendering Time (ms)", - "Output Rendering Status", - "Output Rendering Variation", - "Error Message", - ]; - - const rows = data.map((item) => [ - item.fileName, - item.fileType, - item.fileSize, - formatDateForExport(item.extractedAt), - item.status, - item.confidence, - item.fieldsExtracted, - item.totalTime, - item.stages.uploading.time, - item.stages.uploading.status, - item.stages.uploading.variation, - item.stages.aiAnalysis.time, - item.stages.aiAnalysis.status, - item.stages.aiAnalysis.variation, - item.stages.dataExtraction.time, - item.stages.dataExtraction.status, - item.stages.dataExtraction.variation, - item.stages.outputRendering.time, - item.stages.outputRendering.status, - item.stages.outputRendering.variation, - item.errorMessage || "", - ]); - - const csvContent = [ - headers.join(","), - ...rows.map((row) => row.map((cell) => `"${cell}"`).join(",")), - ].join("\n"); - - return csvContent; - }; - - const downloadFile = (content, fileName, mimeType) => { - const blob = new Blob([content], { type: mimeType }); - const url = URL.createObjectURL(blob); - const link = document.createElement("a"); - link.href = url; - 
link.download = fileName; - document.body.appendChild(link); - link.click(); - document.body.removeChild(link); - URL.revokeObjectURL(url); - }; - - const handleExportCSV = () => { - setIsExporting(true); - setTimeout(() => { - const csvContent = generateCSV(filteredHistory); - downloadFile( - csvContent, - `extraction_history_${new Date().toISOString().slice(0, 10)}.csv`, - "text/csv;charset=utf-8;" - ); - toastSuccess("CSV exported successfully"); - setIsExporting(false); - }, 500); - }; - - const generateExcelXML = (data) => { - const headers = [ - "File Name", - "File Type", - "File Size", - "Extracted At", - "Status", - "Confidence (%)", - "Fields Extracted", - "Total Time (ms)", - "Upload Time (ms)", - "Upload Status", - "Upload Variation", - "AI Analysis Time (ms)", - "AI Analysis Status", - "AI Analysis Variation", - "Data Extraction Time (ms)", - "Data Extraction Status", - "Data Extraction Variation", - "Output Rendering Time (ms)", - "Output Rendering Status", - "Output Rendering Variation", - "Error Message", - ]; - - const rows = data.map((item) => [ - item.fileName, - item.fileType, - item.fileSize, - formatDateForExport(item.extractedAt), - item.status, - item.confidence, - item.fieldsExtracted, - item.totalTime, - item.stages.uploading.time, - item.stages.uploading.status, - item.stages.uploading.variation, - item.stages.aiAnalysis.time, - item.stages.aiAnalysis.status, - item.stages.aiAnalysis.variation, - item.stages.dataExtraction.time, - item.stages.dataExtraction.status, - item.stages.dataExtraction.variation, - item.stages.outputRendering.time, - item.stages.outputRendering.status, - item.stages.outputRendering.variation, - item.errorMessage || "", - ]); - - let xml = ` - - - - -`; - - headers.forEach((header) => { - xml += `${header}`; - }); - xml += ``; - - rows.forEach((row) => { - xml += ``; - row.forEach((cell) => { - const type = typeof cell === "number" ? "Number" : "String"; - xml += `${cell}`; - }); - xml += ``; - }); - - xml += `
`; - return xml; - }; - - const handleExportExcel = () => { - setIsExporting(true); - setTimeout(() => { - const excelContent = generateExcelXML(filteredHistory); - downloadFile( - excelContent, - `extraction_history_${new Date().toISOString().slice(0, 10)}.xls`, - "application/vnd.ms-excel" - ); - toastSuccess("Excel file exported successfully"); - setIsExporting(false); - }, 500); - }; - - const handleExportSingleReport = (item, format) => { - if (format === "csv") { - const csvContent = generateCSV([item]); - downloadFile( - csvContent, - `${item.fileName.replace(/\.[^/.]+$/, "")}_report.csv`, - "text/csv;charset=utf-8;" - ); - toastSuccess("Report exported as CSV"); - } else { - const excelContent = generateExcelXML([item]); - downloadFile( - excelContent, - `${item.fileName.replace(/\.[^/.]+$/, "")}_report.xls`, - "application/vnd.ms-excel" - ); - toastSuccess("Report exported as Excel"); - } - }; - - return ( -
- {/* Header */} -
-
-

- Extraction History -

-

- View detailed reports and performance metrics for all extractions -

-
-
- - {/* Content */} -
- {/* Filters */} -
-
- - setSearchQuery(e.target.value)} - className="pl-10 h-11 rounded-xl border-slate-200" - /> -
- - - {/* Export All Button */} - - - - - - - - Export as CSV - - - - Export as Excel - - -
- {filteredHistory.length} records will be exported -
-
-
-
- - {/* Stats Overview */} -
- {(() => { - const total = history.length; - const completed = history.filter((h) => h.status === "completed").length; - const successRate = total > 0 ? ((completed / total) * 100).toFixed(1) : 0; - const avgTime = history.length > 0 - ? history.reduce((sum, h) => sum + (h.totalTime || 0), 0) / history.length - : 0; - const totalFields = history.reduce((sum, h) => sum + (h.fieldsExtracted || 0), 0); - - return [ - { - label: "Total Extractions", - value: total.toString(), - change: "", - color: "indigo", - }, - { - label: "Success Rate", - value: `${successRate}%`, - change: total > 0 ? `${completed}/${total} successful` : "No data", - color: "emerald", - }, - { - label: "Avg. Processing Time", - value: avgTime >= 1000 ? `${(avgTime / 1000).toFixed(1)}s` : `${Math.round(avgTime)}ms`, - change: "", - color: "violet", - }, - { - label: "Fields Extracted", - value: totalFields.toLocaleString(), - change: "", - color: "amber", - }, - ].map((stat, index) => ( - -

{stat.label}

-

{stat.value}

-

- {stat.change} -

-
- )); - })()} -
- - {/* Loading State */} - {isLoading && ( -
- - - -

Loading extraction history...

-
- )} - - {/* History List */} - {!isLoading && ( -
- {filteredHistory.map((item, index) => ( - - {/* Main Row */} -
- setExpandedReport( - expandedReport === item.id ? null : item.id - ) - } - > -
- {/* File Icon */} -
- -
- - {/* File Info */} -
-
-

- {item.fileName} -

- - {item.fileType} - -
-
- {item.fileSize} - - - {formatDate(item.extractedAt)} - -
-
- - {/* Stats */} -
-
-

Time

-

- {formatTime(item.totalTime)} -

-
-
-

Fields

-

- {item.fieldsExtracted} -

-
-
-

Confidence

-

= 95 - ? "text-emerald-600" - : item.confidence >= 90 - ? "text-amber-600" - : "text-red-600" - )} - > - {item.confidence > 0 ? `${item.confidence}%` : "-"} -

-
-
- - {/* Status & Actions */} -
- - {item.status === "completed" ? ( - - ) : ( - - )} - {item.status} - - -
-
-
- - {/* Expanded Report */} - - {expandedReport === item.id && ( - -
- {/* Error Message */} - {item.errorMessage && ( -
-
- - Error Details -
-

- {item.errorMessage} -

-
- )} - - {/* Performance Report Header */} -
-

- Performance Report -

-
- - - - - - - { - e.stopPropagation(); - handleExportSingleReport(item, "csv"); - }} - > - - Download CSV - - { - e.stopPropagation(); - handleExportSingleReport(item, "excel"); - }} - > - - Download Excel - - - -
-
- - {/* Stage Timing Cards */} -
- {Object.entries(item.stages).map( - ([stageKey, stageData]) => { - const config = stageConfig[stageKey]; - const variationInfo = - variationConfig[stageData.variation]; - const Icon = config.icon; - const VariationIcon = variationInfo.icon; - - return ( -
-
-
- -
- - {config.label} - -
- -
-
-

- {stageData.status === "skipped" - ? "-" - : formatTime(stageData.time)} -

- {stageData.status !== "skipped" && ( -
- - - {variationInfo.label} - -
- )} -
- - {stageData.status === "completed" && ( - - )} - {stageData.status === "failed" && ( - - )} -
- - {/* Progress bar */} -
- -
-
- ); - } - )} -
- - {/* Total Time Summary */} -
-
- -
-

- Total Processing Time -

-

- From upload to output ready -

-
-
-
-

- {formatTime(item.totalTime)} -

-

- {item.status === "completed" - ? "Completed successfully" - : "Process failed"} -

-
-
-
-
- )} -
-
- ))} - {filteredHistory.length === 0 && !error && ( -
-
- -
-

- {history.length === 0 - ? "No extraction history yet" - : "No extractions match your filters"} -

- {history.length === 0 && ( -

- Upload a document to get started -

- )} -
- )} -
- )} -
-
- ); -} +// frontend/src/pages/History.jsx + +import React, { useState, useEffect } from "react"; +import { useNavigate, useSearchParams } from "react-router-dom"; +import { motion, AnimatePresence } from "framer-motion"; +import { + FileText, + Clock, + CheckCircle2, + ChevronRight, + Download, + Eye, + Trash2, + Search, + Filter, + Calendar, + Upload, + Cpu, + TableProperties, + MonitorPlay, + TrendingUp, + TrendingDown, + Minus, + AlertCircle, + X, + FileSpreadsheet, + Table2, +} from "lucide-react"; +import { Button } from "@/components/ui/button"; +import { Input } from "@/components/ui/input"; +import { Badge } from "@/components/ui/badge"; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from "@/components/ui/select"; +import { + DropdownMenu, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuSeparator, + DropdownMenuTrigger, +} from "@/components/ui/dropdown-menu"; +import { cn } from "@/lib/utils"; +import { getHistory } from "@/services/api"; + +// minimal "toast" +const toastSuccess = (msg) => { + console.log(msg); +}; + +const stageConfig = { + uploading: { label: "Uploading", icon: Upload, color: "blue" }, + aiAnalysis: { label: "AI Analysis", icon: Cpu, color: "violet" }, + dataExtraction: { label: "Data Extraction", icon: TableProperties, color: "emerald" }, + outputRendering: { label: "Output Rendering", icon: MonitorPlay, color: "amber" }, +}; + +const variationConfig = { + fast: { icon: TrendingDown, color: "text-emerald-500", label: "Faster than avg" }, + normal: { icon: Minus, color: "text-slate-400", label: "Normal" }, + slow: { icon: TrendingUp, color: "text-amber-500", label: "Slower than avg" }, + error: { icon: AlertCircle, color: "text-red-500", label: "Error" }, + skipped: { icon: Minus, color: "text-slate-300", label: "Skipped" }, +}; + +export default function History() { + const navigate = useNavigate(); + const [searchParams, setSearchParams] = useSearchParams(); + const [searchQuery, 
setSearchQuery] = useState(""); + const [selectedStatus, setSelectedStatus] = useState("all"); + const [expandedReport, setExpandedReport] = useState(null); + const [isExporting, setIsExporting] = useState(false); + const [history, setHistory] = useState([]); + const [isLoading, setIsLoading] = useState(true); + const [error, setError] = useState(null); + + // Fetch history on component mount + useEffect(() => { + const fetchHistory = async () => { + setIsLoading(true); + setError(null); + try { + const data = await getHistory(); + setHistory(data); + + // Check if there's an extractionId in URL (from share link) + const extractionId = searchParams.get("extractionId"); + if (extractionId) { + // Clear the query param and navigate to dashboard + setSearchParams({}); + // Small delay to ensure history is loaded + setTimeout(() => { + navigate(`/?extractionId=${extractionId}`); + }, 100); + } + } catch (err) { + console.error("Failed to fetch history:", err); + setError(err.message || "Failed to load history"); + setHistory([]); // Fallback to empty array + } finally { + setIsLoading(false); + } + }; + + fetchHistory(); + }, [searchParams, setSearchParams, navigate]); + + const filteredHistory = history.filter((item) => { + const matchesSearch = item.fileName?.toLowerCase().includes(searchQuery.toLowerCase()) ?? false; + const matchesStatus = selectedStatus === "all" || item.status === selectedStatus; + return matchesSearch && matchesStatus; + }); + + const formatTime = (ms) => { + if (ms >= 1000) { + return `${(ms / 1000).toFixed(2)}s`; + } + return `${ms}ms`; + }; + + const formatTimeForExport = (ms) => { + return ms >= 1000 ? 
`${(ms / 1000).toFixed(2)}s` : `${ms}ms`; + }; + + const formatDate = (dateString) => { + const date = new Date(dateString); + return date.toLocaleDateString("en-US", { + month: "short", + day: "numeric", + hour: "2-digit", + minute: "2-digit", + }); + }; + + const formatDateForExport = (dateString) => { + const date = new Date(dateString); + return date.toISOString().replace("T", " ").slice(0, 19); + }; + + const generateCSV = (data) => { + const headers = [ + "File Name", + "File Type", + "File Size", + "Extracted At", + "Status", + "Confidence (%)", + "Fields Extracted", + "Total Time (ms)", + "Upload Time (ms)", + "Upload Status", + "Upload Variation", + "AI Analysis Time (ms)", + "AI Analysis Status", + "AI Analysis Variation", + "Data Extraction Time (ms)", + "Data Extraction Status", + "Data Extraction Variation", + "Output Rendering Time (ms)", + "Output Rendering Status", + "Output Rendering Variation", + "Error Message", + ]; + + const rows = data.map((item) => [ + item.fileName, + item.fileType, + item.fileSize, + formatDateForExport(item.extractedAt), + item.status, + item.confidence, + item.fieldsExtracted, + item.totalTime, + item.stages.uploading.time, + item.stages.uploading.status, + item.stages.uploading.variation, + item.stages.aiAnalysis.time, + item.stages.aiAnalysis.status, + item.stages.aiAnalysis.variation, + item.stages.dataExtraction.time, + item.stages.dataExtraction.status, + item.stages.dataExtraction.variation, + item.stages.outputRendering.time, + item.stages.outputRendering.status, + item.stages.outputRendering.variation, + item.errorMessage || "", + ]); + + const csvContent = [ + headers.join(","), + ...rows.map((row) => row.map((cell) => `"${cell}"`).join(",")), + ].join("\n"); + + return csvContent; + }; + + const downloadFile = (content, fileName, mimeType) => { + const blob = new Blob([content], { type: mimeType }); + const url = URL.createObjectURL(blob); + const link = document.createElement("a"); + link.href = url; + 
link.download = fileName; + document.body.appendChild(link); + link.click(); + document.body.removeChild(link); + URL.revokeObjectURL(url); + }; + + const handleExportCSV = () => { + setIsExporting(true); + setTimeout(() => { + const csvContent = generateCSV(filteredHistory); + downloadFile( + csvContent, + `extraction_history_${new Date().toISOString().slice(0, 10)}.csv`, + "text/csv;charset=utf-8;" + ); + toastSuccess("CSV exported successfully"); + setIsExporting(false); + }, 500); + }; + + const generateExcelXML = (data) => { + const headers = [ + "File Name", + "File Type", + "File Size", + "Extracted At", + "Status", + "Confidence (%)", + "Fields Extracted", + "Total Time (ms)", + "Upload Time (ms)", + "Upload Status", + "Upload Variation", + "AI Analysis Time (ms)", + "AI Analysis Status", + "AI Analysis Variation", + "Data Extraction Time (ms)", + "Data Extraction Status", + "Data Extraction Variation", + "Output Rendering Time (ms)", + "Output Rendering Status", + "Output Rendering Variation", + "Error Message", + ]; + + const rows = data.map((item) => [ + item.fileName, + item.fileType, + item.fileSize, + formatDateForExport(item.extractedAt), + item.status, + item.confidence, + item.fieldsExtracted, + item.totalTime, + item.stages.uploading.time, + item.stages.uploading.status, + item.stages.uploading.variation, + item.stages.aiAnalysis.time, + item.stages.aiAnalysis.status, + item.stages.aiAnalysis.variation, + item.stages.dataExtraction.time, + item.stages.dataExtraction.status, + item.stages.dataExtraction.variation, + item.stages.outputRendering.time, + item.stages.outputRendering.status, + item.stages.outputRendering.variation, + item.errorMessage || "", + ]); + + let xml = ` + + + + +`; + + headers.forEach((header) => { + xml += `${header}`; + }); + xml += ``; + + rows.forEach((row) => { + xml += ``; + row.forEach((cell) => { + const type = typeof cell === "number" ? "Number" : "String"; + xml += `${cell}`; + }); + xml += ``; + }); + + xml += `
`; + return xml; + }; + + const handleExportExcel = () => { + setIsExporting(true); + setTimeout(() => { + const excelContent = generateExcelXML(filteredHistory); + downloadFile( + excelContent, + `extraction_history_${new Date().toISOString().slice(0, 10)}.xls`, + "application/vnd.ms-excel" + ); + toastSuccess("Excel file exported successfully"); + setIsExporting(false); + }, 500); + }; + + const handleExportSingleReport = (item, format) => { + if (format === "csv") { + const csvContent = generateCSV([item]); + downloadFile( + csvContent, + `${item.fileName.replace(/\.[^/.]+$/, "")}_report.csv`, + "text/csv;charset=utf-8;" + ); + toastSuccess("Report exported as CSV"); + } else { + const excelContent = generateExcelXML([item]); + downloadFile( + excelContent, + `${item.fileName.replace(/\.[^/.]+$/, "")}_report.xls`, + "application/vnd.ms-excel" + ); + toastSuccess("Report exported as Excel"); + } + }; + + return ( +
+ {/* Header */} +
+
+
+

+ Extraction History +

+

+ View detailed reports and performance metrics for all extractions +

+
+
+
+ + {/* Content */} +
+ {/* Filters */} +
+
+ + setSearchQuery(e.target.value)} + className="pl-10 h-11 rounded-xl border-slate-200" + /> +
+ + + {/* Export All Button */} + + + + + + + + Export as CSV + + + + Export as Excel + + +
+ {filteredHistory.length} records will be exported +
+
+
+
+ + {/* Stats Overview */} +
+ {(() => { + const total = history.length; + const completed = history.filter((h) => h.status === "completed").length; + const successRate = total > 0 ? ((completed / total) * 100).toFixed(1) : 0; + const avgTime = history.length > 0 + ? history.reduce((sum, h) => sum + (h.totalTime || 0), 0) / history.length + : 0; + const totalFields = history.reduce((sum, h) => sum + (h.fieldsExtracted || 0), 0); + + return [ + { + label: "Total Extractions", + value: total.toString(), + change: "", + color: "indigo", + }, + { + label: "Success Rate", + value: `${successRate}%`, + change: total > 0 ? `${completed}/${total} successful` : "No data", + color: "emerald", + }, + { + label: "Avg. Processing Time", + value: avgTime >= 1000 ? `${(avgTime / 1000).toFixed(1)}s` : `${Math.round(avgTime)}ms`, + change: "", + color: "violet", + }, + { + label: "Fields Extracted", + value: totalFields.toLocaleString(), + change: "", + color: "amber", + }, + ].map((stat, index) => ( + +

{stat.label}

+

{stat.value}

+

+ {stat.change} +

+
+ )); + })()} +
+ + {/* Loading State */} + {isLoading && ( +
+ + + +

Loading extraction history...

+
+ )} + + {/* History List */} + {!isLoading && ( +
+ {filteredHistory.map((item, index) => ( + + {/* Main Row */} +
+ setExpandedReport( + expandedReport === item.id ? null : item.id + ) + } + > +
+ {/* File Icon */} +
+ +
+ + {/* File Info */} +
+
+

+ {item.fileName} +

+ + {item.fileType} + +
+
+ {item.fileSize} + + + {formatDate(item.extractedAt)} + +
+
+ + {/* Stats */} +
+
+

Time

+

+ {formatTime(item.totalTime)} +

+
+
+

Fields

+

+ {item.fieldsExtracted} +

+
+
+

Confidence

+

= 95 + ? "text-emerald-600" + : item.confidence >= 90 + ? "text-amber-600" + : "text-red-600" + )} + > + {item.confidence > 0 ? `${item.confidence}%` : "-"} +

+
+
+ + {/* Status & Actions */} +
+ + {item.status === "completed" ? ( + + ) : ( + + )} + {item.status} + + +
+
+
+ + {/* Expanded Report */} + + {expandedReport === item.id && ( + +
+ {/* Error Message */} + {item.errorMessage && ( +
+
+ + Error Details +
+

+ {item.errorMessage} +

+
+ )} + + {/* Performance Report Header */} +
+

+ Performance Report +

+
+ + + + + + + { + e.stopPropagation(); + handleExportSingleReport(item, "csv"); + }} + > + + Download CSV + + { + e.stopPropagation(); + handleExportSingleReport(item, "excel"); + }} + > + + Download Excel + + + +
+
+ + {/* Stage Timing Cards */} +
+ {Object.entries(item.stages).map( + ([stageKey, stageData]) => { + const config = stageConfig[stageKey]; + const variationInfo = + variationConfig[stageData.variation]; + const Icon = config.icon; + const VariationIcon = variationInfo.icon; + + return ( +
+
+
+ +
+ + {config.label} + +
+ +
+
+

+ {stageData.status === "skipped" + ? "-" + : formatTime(stageData.time)} +

+ {stageData.status !== "skipped" && ( +
+ + + {variationInfo.label} + +
+ )} +
+ + {stageData.status === "completed" && ( + + )} + {stageData.status === "failed" && ( + + )} +
+ + {/* Progress bar */} +
+ +
+
+ ); + } + )} +
+ + {/* Total Time Summary */} +
+
+ +
+

+ Total Processing Time +

+

+ From upload to output ready +

+
+
+
+

+ {formatTime(item.totalTime)} +

+

+ {item.status === "completed" + ? "Completed successfully" + : "Process failed"} +

+
+
+
+
+ )} +
+
+ ))} + {filteredHistory.length === 0 && !error && ( +
+
+ +
+

+ {history.length === 0 + ? "No extraction history yet" + : "No extractions match your filters"} +

+ {history.length === 0 && ( +

+ Upload a document to get started +

+ )} +
+ )} +
+ )} +
+
+ ); +} diff --git a/frontend/src/pages/ShareHandler.jsx b/frontend/src/pages/ShareHandler.jsx new file mode 100644 index 0000000000000000000000000000000000000000..f2d1024a72c9243a65c79746f2dfd9c24f03ac4e --- /dev/null +++ b/frontend/src/pages/ShareHandler.jsx @@ -0,0 +1,95 @@ +import React, { useEffect, useState } from "react"; +import { useParams, useNavigate } from "react-router-dom"; +import { useAuth } from "@/contexts/AuthContext"; +import { accessSharedExtraction } from "@/services/api"; +import LoginForm from "@/components/auth/LoginForm"; + +export default function ShareHandler() { + const { token } = useParams(); + const navigate = useNavigate(); + const { isAuthenticated, loading } = useAuth(); + const [isProcessing, setIsProcessing] = useState(false); + const [error, setError] = useState(null); + + useEffect(() => { + const processShare = async () => { + if (loading) return; // Wait for auth to load + + if (!isAuthenticated) { + // User not logged in - they'll be shown login form + // After login, AuthContext will trigger a re-render and this will run again + return; + } + + // User is authenticated, process the share + if (isProcessing) return; // Prevent duplicate calls + setIsProcessing(true); + setError(null); + + try { + const result = await accessSharedExtraction(token); + if (result.success && result.extraction_id) { + // Redirect to history page with the extraction ID + navigate(`/history?extractionId=${result.extraction_id}`); + } else { + setError("Failed to access shared extraction"); + } + } catch (err) { + console.error("Share access error:", err); + setError(err.message || "Failed to access shared extraction"); + // Still redirect to history after 3 seconds + setTimeout(() => { + navigate("/history"); + }, 3000); + } finally { + setIsProcessing(false); + } + }; + + processShare(); + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [token, isAuthenticated, loading]); + + // Show login form if not authenticated + if 
(!isAuthenticated && !loading) { + return ; + } + + // Show loading state while processing + if (isProcessing || loading) { + return ( +
+
+
+
+
+

Loading shared extraction...

+
+
+ ); + } + + // Show error state + if (error) { + return ( +
+
+
+
+
+

Error

+

{error}

+ +
+
+ ); + } + + return null; +} + diff --git a/frontend/src/services/api.js b/frontend/src/services/api.js index 89c4f7bd181c6cc35313ab1a4880659ce09782d6..6ded22f35a65dbdd9f74fb305b5c4e4b3ff6a3b2 100644 --- a/frontend/src/services/api.js +++ b/frontend/src/services/api.js @@ -1,59 +1,173 @@ -/** - * API service for communicating with the FastAPI backend - */ - -const API_BASE_URL = import.meta.env.VITE_API_BASE_URL || ""; - -/** - * Extract data from a document - * @param {File} file - The file to extract data from - * @returns {Promise} Extraction result with fields, confidence, etc. - */ -export async function extractDocument(file) { - const formData = new FormData(); - formData.append("file", file); - - const response = await fetch(`${API_BASE_URL}/api/extract`, { - method: "POST", - body: formData, - }); - - if (!response.ok) { - const errorData = await response.json().catch(() => ({ - error: `HTTP ${response.status}: ${response.statusText}`, - })); - throw new Error(errorData.error || errorData.detail || "Extraction failed"); - } - - return await response.json(); -} - -/** - * Get extraction history - * @returns {Promise} Array of extraction records - */ -export async function getHistory() { - const response = await fetch(`${API_BASE_URL}/api/history`); - - if (!response.ok) { - const errorData = await response.json().catch(() => ({ - error: `HTTP ${response.status}: ${response.statusText}`, - })); - throw new Error(errorData.error || errorData.detail || "Failed to fetch history"); - } - - return await response.json(); -} - -/** - * Health check endpoint - * @returns {Promise} Status object - */ -export async function ping() { - const response = await fetch(`${API_BASE_URL}/ping`); - if (!response.ok) { - throw new Error("Backend is not available"); - } - return await response.json(); -} - +/** + * API service for communicating with the FastAPI backend + */ + +const API_BASE_URL = import.meta.env.VITE_API_BASE_URL || ""; + +/** + * Get authorization headers with 
token + */ +function getAuthHeaders() { + const token = localStorage.getItem("auth_token"); + return token ? { Authorization: `Bearer ${token}` } : {}; +} + +/** + * Extract data from a document + * @param {File} file - The file to extract data from + * @param {string} keyFields - Optional comma-separated list of fields to extract + * @returns {Promise} Extraction result with fields, confidence, etc. + */ +export async function extractDocument(file, keyFields = "") { + const formData = new FormData(); + formData.append("file", file); + if (keyFields && keyFields.trim()) { + formData.append("key_fields", keyFields.trim()); + } + + const response = await fetch(`${API_BASE_URL}/api/extract`, { + method: "POST", + headers: getAuthHeaders(), + body: formData, + }); + + if (!response.ok) { + const errorData = await response.json().catch(() => ({ + error: `HTTP ${response.status}: ${response.statusText}`, + })); + throw new Error(errorData.error || errorData.detail || "Extraction failed"); + } + + return await response.json(); +} + +/** + * Get extraction history + * @returns {Promise} Array of extraction records + */ +export async function getHistory() { + const response = await fetch(`${API_BASE_URL}/api/history`, { + headers: getAuthHeaders(), + }); + + if (!response.ok) { + const errorData = await response.json().catch(() => ({ + error: `HTTP ${response.status}: ${response.statusText}`, + })); + throw new Error(errorData.error || errorData.detail || "Failed to fetch history"); + } + + return await response.json(); +} + +/** + * Get a specific extraction by ID with full fields data + * @param {number} extractionId - The extraction ID + * @returns {Promise} Extraction result with fields + */ +export async function getExtractionById(extractionId) { + const response = await fetch(`${API_BASE_URL}/api/extraction/${extractionId}`, { + headers: getAuthHeaders(), + }); + + if (!response.ok) { + const errorData = await response.json().catch(() => ({ + error: `HTTP 
${response.status}: ${response.statusText}`, + })); + throw new Error(errorData.error || errorData.detail || "Failed to fetch extraction"); + } + + return await response.json(); +} + +/** + * Create a shareable link for an extraction + * @param {number} extractionId - The extraction ID to share + * @returns {Promise} Share link result with share_link + */ +export async function createShareLink(extractionId) { + const response = await fetch(`${API_BASE_URL}/api/share/link`, { + method: "POST", + headers: { + "Content-Type": "application/json", + ...getAuthHeaders(), + }, + body: JSON.stringify({ + extraction_id: extractionId, + }), + }); + + if (!response.ok) { + const errorData = await response.json().catch(() => ({ + error: `HTTP ${response.status}: ${response.statusText}`, + })); + throw new Error(errorData.error || errorData.detail || "Failed to create share link"); + } + + return await response.json(); +} + +/** + * Share an extraction with another user(s) + * @param {number} extractionId - The extraction ID to share + * @param {string|string[]} recipientEmails - Recipient email address(es) - can be a single email or array of emails + * @returns {Promise} Share result + */ +export async function shareExtraction(extractionId, recipientEmails) { + // Ensure recipient_emails is always an array + const emailsArray = Array.isArray(recipientEmails) ? 
recipientEmails : [recipientEmails]; + + const response = await fetch(`${API_BASE_URL}/api/share`, { + method: "POST", + headers: { + "Content-Type": "application/json", + ...getAuthHeaders(), + }, + body: JSON.stringify({ + extraction_id: extractionId, + recipient_emails: emailsArray, + }), + }); + + if (!response.ok) { + const errorData = await response.json().catch(() => ({ + error: `HTTP ${response.status}: ${response.statusText}`, + })); + throw new Error(errorData.error || errorData.detail || "Failed to share extraction"); + } + + return await response.json(); +} + +/** + * Access a shared extraction by token + * @param {string} token - Share token + * @returns {Promise} Share access result with extraction_id + */ +export async function accessSharedExtraction(token) { + const response = await fetch(`${API_BASE_URL}/api/share/${token}`, { + headers: getAuthHeaders(), + }); + + if (!response.ok) { + const errorData = await response.json().catch(() => ({ + error: `HTTP ${response.status}: ${response.statusText}`, + })); + throw new Error(errorData.error || errorData.detail || "Failed to access shared extraction"); + } + + return await response.json(); +} + +/** + * Health check endpoint + * @returns {Promise} Status object + */ +export async function ping() { + const response = await fetch(`${API_BASE_URL}/ping`); + if (!response.ok) { + throw new Error("Backend is not available"); + } + return await response.json(); +} + diff --git a/frontend/src/services/auth.js b/frontend/src/services/auth.js new file mode 100644 index 0000000000000000000000000000000000000000..12476865a058f6fa1bc5d291482e5c6a1cc6385b --- /dev/null +++ b/frontend/src/services/auth.js @@ -0,0 +1,111 @@ +/** + * Authentication service for Firebase and OTP authentication + */ + +const API_BASE_URL = import.meta.env.VITE_API_BASE_URL || ""; + +/** + * Get the current authenticated user + * @returns {Promise} User object + */ +export async function getCurrentUser() { + const token = 
localStorage.getItem("auth_token"); + if (!token) { + throw new Error("No token found"); + } + + const response = await fetch(`${API_BASE_URL}/api/auth/me`, { + method: "GET", + headers: { + Authorization: `Bearer ${token}`, + }, + }); + + if (!response.ok) { + if (response.status === 401) { + localStorage.removeItem("auth_token"); + } + const errorData = await response.json().catch(() => ({})); + throw new Error(errorData.detail || "Failed to get user"); + } + + return await response.json(); +} + +/** + * Login with Firebase ID token + * @param {string} idToken - Firebase ID token + * @returns {Promise} Response with token and user + */ +export async function firebaseLogin(idToken) { + const response = await fetch(`${API_BASE_URL}/api/auth/firebase/login`, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ id_token: idToken }), + }); + + if (!response.ok) { + const errorData = await response.json().catch(() => ({})); + throw new Error(errorData.detail || "Firebase login failed"); + } + + return await response.json(); +} + +/** + * Request OTP for email login + * @param {string} email - Email address + * @returns {Promise} Response with success message + */ +export async function requestOTP(email) { + const response = await fetch(`${API_BASE_URL}/api/auth/otp/request`, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ email }), + }); + + if (!response.ok) { + const errorData = await response.json().catch(() => ({})); + throw new Error(errorData.detail || "Failed to send OTP"); + } + + return await response.json(); +} + +/** + * Verify OTP and login + * @param {string} email - Email address + * @param {string} otp - OTP code + * @returns {Promise} Response with token and user + */ +export async function verifyOTP(email, otp) { + const response = await fetch(`${API_BASE_URL}/api/auth/otp/verify`, { + method: "POST", + headers: { + "Content-Type": "application/json", + 
}, + body: JSON.stringify({ email, otp }), + }); + + if (!response.ok) { + const errorData = await response.json().catch(() => ({})); + throw new Error(errorData.detail || "OTP verification failed"); + } + + return await response.json(); +} + +/** + * Logout the current user + * @returns {Promise} + */ +export async function logout() { + // For JWT tokens, logout is handled client-side by removing the token + // No server-side logout needed + return Promise.resolve(); +} + diff --git a/frontend/src/utils.js b/frontend/src/utils.js index 1dbea0d1b5e7715720de23c39a03f3d70f125b03..1f1365752770f7639aa300a24473b69cd35c0d57 100644 --- a/frontend/src/utils.js +++ b/frontend/src/utils.js @@ -1,8 +1,8 @@ -// frontend/src/utils.jsx - -export function createPageUrl(pageName) { - if (!pageName) return "/"; - const lower = pageName.toLowerCase(); - if (lower === "dashboard") return "/"; - return `/${lower}`; -} +// frontend/src/utils.jsx + +export function createPageUrl(pageName) { + if (!pageName) return "/"; + const lower = pageName.toLowerCase(); + if (lower === "dashboard") return "/"; + return `/${lower}`; +} diff --git a/frontend/tailwind.config.cjs b/frontend/tailwind.config.cjs index c186f164ed7a996939aac68f23bfa2776121892a..f06286e9e4aa3311278e1d1a0624796d71618a75 100644 --- a/frontend/tailwind.config.cjs +++ b/frontend/tailwind.config.cjs @@ -1,10 +1,10 @@ -module.exports = { - content: [ - "./index.html", - "./src/**/*.{js,jsx}" - ], - theme: { - extend: {} - }, - plugins: [] -}; +module.exports = { + content: [ + "./index.html", + "./src/**/*.{js,jsx}" + ], + theme: { + extend: {} + }, + plugins: [] +}; diff --git a/frontend/vite.config.js b/frontend/vite.config.js index 67dc56d7d1c02b54a40bc17b035432dca4c0fbd9..0b2291845410516e108d554c3c81a5060ae2c65f 100644 --- a/frontend/vite.config.js +++ b/frontend/vite.config.js @@ -1,25 +1,25 @@ -import { defineConfig } from "vite"; -import react from "@vitejs/plugin-react"; -import path from "node:path"; - -export default 
defineConfig({ - plugins: [react()], - resolve: { - alias: { - "@": path.resolve(__dirname, "./src") - } - }, - build: { - outDir: "dist", // Dockerfile copies /frontend/dist → backend/frontend_dist - sourcemap: false, - rollupOptions: { - output: { - manualChunks: undefined - } - } - }, - base: "/", // Ensure assets are loaded from root - optimizeDeps: { - exclude: ["pdfjs-dist"] - } -}); +import { defineConfig } from "vite"; +import react from "@vitejs/plugin-react"; +import path from "node:path"; + +export default defineConfig({ + plugins: [react()], + resolve: { + alias: { + "@": path.resolve(__dirname, "./src") + } + }, + build: { + outDir: "dist", // Dockerfile copies /frontend/dist → backend/frontend_dist + sourcemap: false, + rollupOptions: { + output: { + manualChunks: undefined + } + } + }, + base: "/", // Ensure assets are loaded from root + optimizeDeps: { + exclude: ["pdfjs-dist"] + } +});