Spaces:
Running
Running
Happy People committed on
Commit ·
8b33e12
1
Parent(s): 7470785
Standalone worker: zero backend dependencies
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff
- Dockerfile +13 -28
- backend/app/__init__.py +0 -98
- backend/app/api/admin.py +0 -433
- backend/app/api/auth.py +0 -221
- backend/app/api/endpoints.py +0 -742
- backend/app/core/__init__.py +0 -52
- backend/app/core/config.py +0 -59
- backend/app/core/database.py +0 -26
- backend/app/core/feature_registry.py +0 -255
- backend/app/core/migrations.py +0 -111
- backend/app/core/plan_config.py +0 -192
- backend/app/core/security.py +0 -28
- backend/app/core/stripe_config.py +0 -29
- backend/app/main.py +0 -124
- backend/app/models/feature_flags.py +0 -59
- backend/app/models/user.py +0 -63
- backend/app/schemas/chat.py +0 -14
- backend/app/schemas/financial.py +0 -47
- backend/app/schemas/user.py +0 -82
- backend/app/services/__init__.py +0 -37
- backend/app/services/analysis/__init__.py +0 -54
- backend/app/services/analysis/engine_lite.py +0 -48
- backend/app/services/analysis/factory.py +0 -18
- backend/app/services/analysis/fundamental.py +0 -75
- backend/app/services/analysis/growth.py +0 -26
- backend/app/services/analysis/health_score.py +0 -46
- backend/app/services/analysis/kpi.py +0 -56
- backend/app/services/analysis/risk.py +0 -57
- backend/app/services/analysis/simulation.py +0 -67
- backend/app/services/feature_service.py +0 -306
- backend/app/services/ingestion/__init__.py +0 -57
- backend/app/services/ingestion/dolphin/__init__.py +0 -158
- backend/app/services/ingestion/dolphin/classifier.py +0 -288
- backend/app/services/ingestion/dolphin/extractor.py +0 -336
- backend/app/services/ingestion/dolphin/remote_client.py +0 -110
- backend/app/services/ingestion/mappings.py +0 -315
- backend/app/services/ingestion/parser_csv.py +0 -127
- backend/app/services/ingestion/parser_dolphin.py +0 -429
- backend/app/services/ingestion/parser_pdf.py +0 -402
- backend/app/services/ingestion/parser_xlsx.py +0 -312
- backend/app/services/ingestion/unified_parser.py +0 -84
- backend/app/services/intelligence/ai_cfo.py +0 -52
- backend/app/services/intelligence/gemini_service.py +0 -238
- backend/app/services/intelligence/geo_service.py +0 -104
- backend/app/services/intelligence/rag.py +0 -35
- backend/app/services/reporting/pdf_report.py +0 -565
- backend/app/services/reporting/pptx_report.py +0 -57
- backend/requirements.txt +0 -29
- dolphin/__init__.py +37 -0
- {backend/app/services/ingestion/dolphin → dolphin}/client.py +41 -179
Dockerfile
CHANGED
|
@@ -1,38 +1,23 @@
|
|
| 1 |
-
FROM nvidia/cuda:11.8.0-
|
| 2 |
|
| 3 |
-
#
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
ENV PATH="/home/user/.local/bin:$PATH"
|
| 7 |
-
|
| 8 |
-
# Install system dependencies
|
| 9 |
-
RUN apt-get update && apt-get install -y \
|
| 10 |
-
python3.10 \
|
| 11 |
-
python3-pip \
|
| 12 |
-
python3-venv \
|
| 13 |
-
poppler-utils \
|
| 14 |
-
git \
|
| 15 |
&& rm -rf /var/lib/apt/lists/*
|
| 16 |
|
| 17 |
-
# Create user (
|
| 18 |
RUN useradd -m -u 1000 user
|
| 19 |
USER user
|
| 20 |
-
|
| 21 |
|
| 22 |
-
|
| 23 |
-
# We expect the `visique/backend` code to be copied into `backend/`
|
| 24 |
-
# and `visique/ai-worker` code to be in `.`
|
| 25 |
-
COPY --chown=user:user . .
|
| 26 |
|
| 27 |
-
# Install
|
| 28 |
-
|
| 29 |
-
|
| 30 |
|
| 31 |
-
#
|
| 32 |
-
|
| 33 |
|
| 34 |
-
# Expose port (HF Spaces defaults to 7860)
|
| 35 |
EXPOSE 7860
|
| 36 |
-
|
| 37 |
-
# CMD to copy backend lib and start app
|
| 38 |
-
CMD ["/bin/bash", "-c", "cp -r backend/app . && uvicorn main:app --host 0.0.0.0 --port 7860"]
|
|
|
|
| 1 |
+
FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
|
| 2 |
|
| 3 |
+
# System deps
|
| 4 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 5 |
+
python3 python3-pip poppler-utils git \
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
&& rm -rf /var/lib/apt/lists/*
|
| 7 |
|
| 8 |
+
# Create non-root user (required by HF Spaces)
|
| 9 |
RUN useradd -m -u 1000 user
|
| 10 |
USER user
|
| 11 |
+
ENV HOME=/home/user PATH="/home/user/.local/bin:$PATH"
|
| 12 |
|
| 13 |
+
WORKDIR /home/user/app
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
+
# Install Python deps first (layer caching)
|
| 16 |
+
COPY --chown=user requirements.txt .
|
| 17 |
+
RUN pip install --no-cache-dir --user -r requirements.txt
|
| 18 |
|
| 19 |
+
# Copy application code
|
| 20 |
+
COPY --chown=user . .
|
| 21 |
|
|
|
|
| 22 |
EXPOSE 7860
|
| 23 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
|
|
|
backend/app/__init__.py
DELETED
|
@@ -1,98 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Visique Backend Application
|
| 3 |
-
|
| 4 |
-
This package contains the backend API and services for the Visique financial analysis platform.
|
| 5 |
-
|
| 6 |
-
## Architecture Overview
|
| 7 |
-
|
| 8 |
-
```
|
| 9 |
-
app/
|
| 10 |
-
├── api/ # FastAPI route handlers
|
| 11 |
-
│ ├── admin.py # Admin console endpoints (users, reports, features)
|
| 12 |
-
│ ├── auth.py # Authentication (login, register, JWT)
|
| 13 |
-
│ └── endpoints.py # Analysis endpoints (upload, simulate, report)
|
| 14 |
-
│
|
| 15 |
-
├── core/ # Core configuration and utilities
|
| 16 |
-
│ ├── config.py # Environment settings (API keys, URLs)
|
| 17 |
-
│ ├── database.py # SQLAlchemy database connection
|
| 18 |
-
│ ├── security.py # JWT token creation/validation
|
| 19 |
-
│ ├── feature_registry.py # Auto-discoverable feature definitions
|
| 20 |
-
│ └── plan_config.py # Plan limits and default features
|
| 21 |
-
│
|
| 22 |
-
├── models/ # SQLAlchemy database models
|
| 23 |
-
│ ├── user.py # User, Analysis, Payment models
|
| 24 |
-
│ └── feature_flags.py # PlanFeatureOverride, PlanUploadLimit
|
| 25 |
-
│
|
| 26 |
-
├── schemas/ # Pydantic request/response schemas
|
| 27 |
-
│ ├── user.py # UserCreate, UserResponse, etc.
|
| 28 |
-
│ ├── financial.py # StandardizedDataPackage, KPIs, etc.
|
| 29 |
-
│ └── chat.py # ChatRequest, ChatResponse
|
| 30 |
-
│
|
| 31 |
-
├── services/ # Business logic layer
|
| 32 |
-
│ ├── feature_service.py # Feature flag resolution logic
|
| 33 |
-
│ ├── analysis/ # Financial analysis modules
|
| 34 |
-
│ │ ├── fundamental.py # Main analysis orchestrator
|
| 35 |
-
│ │ ├── kpi.py # KPI calculations
|
| 36 |
-
│ │ ├── risk.py # Risk analysis
|
| 37 |
-
│ │ ├── health_score.py # Health score computation
|
| 38 |
-
│ │ ├── growth.py # Growth metrics
|
| 39 |
-
│ │ └── simulation.py # What-if scenario modeling
|
| 40 |
-
│ ├── ingestion/ # Data parsing
|
| 41 |
-
│ │ ├── parser_csv.py # CSV file parsing
|
| 42 |
-
│ │ ├── parser_pdf.py # PDF extraction + OCR
|
| 43 |
-
│ │ └── mappings.py # Field name normalization
|
| 44 |
-
│ ├── intelligence/ # AI-powered features
|
| 45 |
-
│ │ ├── gemini_service.py # Gemini API integration
|
| 46 |
-
│ │ ├── ai_cfo.py # AI CFO chat functionality
|
| 47 |
-
│ │ ├── geo_service.py # Geo-strategic analysis
|
| 48 |
-
│ │ └── rag.py # RAG for document QA
|
| 49 |
-
│ └── reporting/ # Report generation
|
| 50 |
-
│ ├── pdf_report.py # PDF report builder
|
| 51 |
-
│ └── pptx_report.py # PowerPoint builder
|
| 52 |
-
│
|
| 53 |
-
└── main.py # FastAPI app initialization
|
| 54 |
-
```
|
| 55 |
-
|
| 56 |
-
## Module Responsibilities
|
| 57 |
-
|
| 58 |
-
### API Layer (`api/`)
|
| 59 |
-
- HTTP request handling only
|
| 60 |
-
- Input validation via Pydantic
|
| 61 |
-
- Delegates all logic to services
|
| 62 |
-
- Returns standardized responses
|
| 63 |
-
|
| 64 |
-
### Core Layer (`core/`)
|
| 65 |
-
- Application-wide configuration
|
| 66 |
-
- Feature registry (add new features here)
|
| 67 |
-
- Plan configuration (modify limits here)
|
| 68 |
-
- Security utilities (JWT)
|
| 69 |
-
|
| 70 |
-
### Models Layer (`models/`)
|
| 71 |
-
- Database schema definitions
|
| 72 |
-
- Relationships between entities
|
| 73 |
-
- No business logic
|
| 74 |
-
|
| 75 |
-
### Schemas Layer (`schemas/`)
|
| 76 |
-
- Request/response validation
|
| 77 |
-
- Data transformation for API
|
| 78 |
-
- Type hints for IDE support
|
| 79 |
-
|
| 80 |
-
### Services Layer (`services/`)
|
| 81 |
-
- All business logic lives here
|
| 82 |
-
- Each subdirectory is a domain
|
| 83 |
-
- Services are stateless and testable
|
| 84 |
-
|
| 85 |
-
## Adding New Features
|
| 86 |
-
|
| 87 |
-
1. **New Feature Flag**: Add to `core/feature_registry.py`
|
| 88 |
-
2. **New API Endpoint**: Add to appropriate `api/*.py`
|
| 89 |
-
3. **New Service Logic**: Create in `services/` subdirectory
|
| 90 |
-
4. **New Model Field**: Add to `models/` and run migration
|
| 91 |
-
|
| 92 |
-
## Key Design Patterns
|
| 93 |
-
|
| 94 |
-
- **Repository Pattern**: Services interact with DB via session
|
| 95 |
-
- **Dependency Injection**: FastAPI `Depends()` for DB/auth
|
| 96 |
-
- **Single Responsibility**: Each module has one clear purpose
|
| 97 |
-
- **Feature Registry**: Auto-discoverable, category-organized
|
| 98 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/api/admin.py
DELETED
|
@@ -1,433 +0,0 @@
|
|
| 1 |
-
from fastapi import APIRouter, Depends, HTTPException, status
|
| 2 |
-
from sqlalchemy.orm import Session
|
| 3 |
-
from typing import List, Optional
|
| 4 |
-
from app.core.database import get_db
|
| 5 |
-
from app.models.user import User, Payment, Analysis
|
| 6 |
-
from app.schemas.user import UserResponse, PaymentResponse
|
| 7 |
-
from app.api.auth import get_current_user
|
| 8 |
-
import os
|
| 9 |
-
|
| 10 |
-
router = APIRouter(prefix="/admin", tags=["admin"])
|
| 11 |
-
|
| 12 |
-
def get_current_admin(current_user: User = Depends(get_current_user)):
|
| 13 |
-
if not current_user.is_admin:
|
| 14 |
-
raise HTTPException(
|
| 15 |
-
status_code=status.HTTP_403_FORBIDDEN,
|
| 16 |
-
detail="The user doesn't have enough privileges",
|
| 17 |
-
)
|
| 18 |
-
return current_user
|
| 19 |
-
|
| 20 |
-
@router.get("/payments", response_model=List[PaymentResponse])
|
| 21 |
-
def read_all_payments(
|
| 22 |
-
skip: int = 0,
|
| 23 |
-
limit: int = 100,
|
| 24 |
-
db: Session = Depends(get_db),
|
| 25 |
-
current_user: User = Depends(get_current_admin)
|
| 26 |
-
):
|
| 27 |
-
payments = db.query(Payment).offset(skip).limit(limit).all()
|
| 28 |
-
return payments
|
| 29 |
-
|
| 30 |
-
@router.delete("/users/{user_id}", status_code=status.HTTP_204_NO_CONTENT)
|
| 31 |
-
def delete_user(
|
| 32 |
-
user_id: int,
|
| 33 |
-
db: Session = Depends(get_db),
|
| 34 |
-
current_user: User = Depends(get_current_admin)
|
| 35 |
-
):
|
| 36 |
-
user = db.query(User).filter(User.id == user_id).first()
|
| 37 |
-
if not user:
|
| 38 |
-
raise HTTPException(status_code=404, detail="User not found")
|
| 39 |
-
|
| 40 |
-
if user.id == current_user.id:
|
| 41 |
-
raise HTTPException(status_code=400, detail="Cannot delete your own admin account")
|
| 42 |
-
|
| 43 |
-
db.delete(user)
|
| 44 |
-
db.commit()
|
| 45 |
-
return None
|
| 46 |
-
|
| 47 |
-
from pydantic import BaseModel
|
| 48 |
-
class AdminUserUpdate(BaseModel):
|
| 49 |
-
full_name: Optional[str] = None
|
| 50 |
-
company_name: Optional[str] = None
|
| 51 |
-
plan: Optional[str] = None
|
| 52 |
-
is_admin: Optional[bool] = None
|
| 53 |
-
is_super_admin: Optional[bool] = None
|
| 54 |
-
visique_id: Optional[str] = None
|
| 55 |
-
ein: Optional[str] = None
|
| 56 |
-
address: Optional[str] = None
|
| 57 |
-
industry: Optional[str] = None
|
| 58 |
-
|
| 59 |
-
class FeatureToggleRequest(BaseModel):
|
| 60 |
-
feature_states: dict # {feature_id: bool}
|
| 61 |
-
|
| 62 |
-
@router.put("/users/{user_id}", response_model=UserResponse)
|
| 63 |
-
def update_user_admin(
|
| 64 |
-
user_id: int,
|
| 65 |
-
user_update: AdminUserUpdate,
|
| 66 |
-
db: Session = Depends(get_db),
|
| 67 |
-
current_user: User = Depends(get_current_admin)
|
| 68 |
-
):
|
| 69 |
-
user = db.query(User).filter(User.id == user_id).first()
|
| 70 |
-
if not user:
|
| 71 |
-
raise HTTPException(status_code=404, detail="User not found")
|
| 72 |
-
|
| 73 |
-
# Check if target is admin and requester is not super admin
|
| 74 |
-
if user.is_admin and not current_user.is_super_admin:
|
| 75 |
-
raise HTTPException(
|
| 76 |
-
status_code=403,
|
| 77 |
-
detail="Only Special Admins can edit Admin profiles"
|
| 78 |
-
)
|
| 79 |
-
|
| 80 |
-
update_data = user_update.dict(exclude_unset=True)
|
| 81 |
-
for key, value in update_data.items():
|
| 82 |
-
# Only super admins can change is_super_admin status
|
| 83 |
-
if key == "is_super_admin" and not current_user.is_super_admin:
|
| 84 |
-
continue
|
| 85 |
-
setattr(user, key, value)
|
| 86 |
-
|
| 87 |
-
db.commit()
|
| 88 |
-
db.refresh(user)
|
| 89 |
-
return user
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
@router.put("/users/{user_id}/features")
|
| 93 |
-
def update_user_features(
|
| 94 |
-
user_id: int,
|
| 95 |
-
request: FeatureToggleRequest,
|
| 96 |
-
db: Session = Depends(get_db),
|
| 97 |
-
current_user: User = Depends(get_current_admin)
|
| 98 |
-
):
|
| 99 |
-
"""
|
| 100 |
-
Update custom feature overrides for a specific user.
|
| 101 |
-
"""
|
| 102 |
-
user = db.query(User).filter(User.id == user_id).first()
|
| 103 |
-
if not user:
|
| 104 |
-
raise HTTPException(status_code=404, detail="User not found")
|
| 105 |
-
|
| 106 |
-
# Get current and merge
|
| 107 |
-
current_features = user.custom_features or {}
|
| 108 |
-
|
| 109 |
-
# Handle SQLite parsing if needed
|
| 110 |
-
if isinstance(current_features, str):
|
| 111 |
-
import json
|
| 112 |
-
try:
|
| 113 |
-
current_features = json.loads(current_features)
|
| 114 |
-
except:
|
| 115 |
-
current_features = {}
|
| 116 |
-
|
| 117 |
-
# Ensure it's a dict copy to trigger mutation detection
|
| 118 |
-
new_features = dict(current_features)
|
| 119 |
-
|
| 120 |
-
for k, v in request.feature_states.items():
|
| 121 |
-
new_features[k] = v
|
| 122 |
-
|
| 123 |
-
user.custom_features = new_features
|
| 124 |
-
|
| 125 |
-
from sqlalchemy.orm.attributes import flag_modified
|
| 126 |
-
flag_modified(user, "custom_features")
|
| 127 |
-
|
| 128 |
-
db.commit()
|
| 129 |
-
return {
|
| 130 |
-
"status": "success",
|
| 131 |
-
"user_id": user.id,
|
| 132 |
-
"custom_features": user.custom_features
|
| 133 |
-
}
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
class EngineUpdateRequest(BaseModel):
|
| 137 |
-
engine: str
|
| 138 |
-
|
| 139 |
-
@router.put("/users/{user_id}/engine")
|
| 140 |
-
def update_user_engine(
|
| 141 |
-
user_id: int,
|
| 142 |
-
request: EngineUpdateRequest,
|
| 143 |
-
db: Session = Depends(get_db),
|
| 144 |
-
current_user: User = Depends(get_current_admin)
|
| 145 |
-
):
|
| 146 |
-
"""
|
| 147 |
-
Update a user's preferred engine (v1 or v2).
|
| 148 |
-
"""
|
| 149 |
-
user = db.query(User).filter(User.id == user_id).first()
|
| 150 |
-
if not user:
|
| 151 |
-
raise HTTPException(status_code=404, detail="User not found")
|
| 152 |
-
|
| 153 |
-
if request.engine not in ["v1", "v2"]:
|
| 154 |
-
raise HTTPException(status_code=400, detail="Invalid engine. Must be 'v1' or 'v2'")
|
| 155 |
-
|
| 156 |
-
user.preferred_engine = request.engine
|
| 157 |
-
db.commit()
|
| 158 |
-
db.refresh(user)
|
| 159 |
-
|
| 160 |
-
return {"status": "success", "user_id": user.id, "preferred_engine": user.preferred_engine}
|
| 161 |
-
|
| 162 |
-
@router.get("/users", response_model=List[UserResponse])
|
| 163 |
-
def read_all_users(
|
| 164 |
-
skip: int = 0,
|
| 165 |
-
limit: int = 100,
|
| 166 |
-
search: Optional[str] = None,
|
| 167 |
-
db: Session = Depends(get_db),
|
| 168 |
-
current_user: User = Depends(get_current_admin)
|
| 169 |
-
):
|
| 170 |
-
query = db.query(User)
|
| 171 |
-
if search:
|
| 172 |
-
# Search by Visique ID (exact or partial) or Email or Name
|
| 173 |
-
search_filter = f"%{search}%"
|
| 174 |
-
query = query.filter(
|
| 175 |
-
(User.email.ilike(search_filter)) |
|
| 176 |
-
(User.full_name.ilike(search_filter)) |
|
| 177 |
-
(User.visique_id.ilike(search_filter))
|
| 178 |
-
)
|
| 179 |
-
return query.offset(skip).limit(limit).all()
|
| 180 |
-
|
| 181 |
-
@router.get("/analyses")
|
| 182 |
-
def read_all_analyses(
|
| 183 |
-
skip: int = 0,
|
| 184 |
-
limit: int = 100,
|
| 185 |
-
db: Session = Depends(get_db),
|
| 186 |
-
current_user: User = Depends(get_current_admin)
|
| 187 |
-
):
|
| 188 |
-
"""
|
| 189 |
-
Get all analyses from all users.
|
| 190 |
-
Returns a simplified list for the admin dashboard.
|
| 191 |
-
"""
|
| 192 |
-
# Join with User to get owner details
|
| 193 |
-
analyses = db.query(Analysis).join(User).order_by(Analysis.timestamp.desc()).offset(skip).limit(limit).all()
|
| 194 |
-
|
| 195 |
-
result = []
|
| 196 |
-
for a in analyses:
|
| 197 |
-
result.append({
|
| 198 |
-
"id": a.id,
|
| 199 |
-
"company_name": a.company_name,
|
| 200 |
-
"filename": a.input_filename,
|
| 201 |
-
"timestamp": a.timestamp,
|
| 202 |
-
"owner_email": a.owner.email,
|
| 203 |
-
"owner_visique_id": a.owner.visique_id
|
| 204 |
-
})
|
| 205 |
-
return result
|
| 206 |
-
|
| 207 |
-
@router.delete("/analyses/{analysis_id}", status_code=status.HTTP_204_NO_CONTENT)
|
| 208 |
-
def delete_analysis_admin(
|
| 209 |
-
analysis_id: int,
|
| 210 |
-
db: Session = Depends(get_db),
|
| 211 |
-
current_user: User = Depends(get_current_admin)
|
| 212 |
-
):
|
| 213 |
-
analysis = db.query(Analysis).filter(Analysis.id == analysis_id).first()
|
| 214 |
-
if not analysis:
|
| 215 |
-
raise HTTPException(status_code=404, detail="Analysis not found")
|
| 216 |
-
|
| 217 |
-
# Delete file from disk
|
| 218 |
-
if analysis.stored_filename and os.path.exists(analysis.stored_filename):
|
| 219 |
-
try:
|
| 220 |
-
os.remove(analysis.stored_filename)
|
| 221 |
-
except OSError:
|
| 222 |
-
pass # Continue even if file delete fails
|
| 223 |
-
|
| 224 |
-
db.delete(analysis)
|
| 225 |
-
db.commit()
|
| 226 |
-
return None
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
# =============================================================================
|
| 230 |
-
# USAGE TRACKING ENDPOINTS
|
| 231 |
-
# =============================================================================
|
| 232 |
-
|
| 233 |
-
@router.get("/usage")
|
| 234 |
-
def get_usage_stats(
|
| 235 |
-
db: Session = Depends(get_db),
|
| 236 |
-
current_user: User = Depends(get_current_admin)
|
| 237 |
-
):
|
| 238 |
-
"""
|
| 239 |
-
Get upload usage statistics for all users.
|
| 240 |
-
Shows uploads used, limit, and percentage for admin dashboard.
|
| 241 |
-
"""
|
| 242 |
-
from app.services.feature_service import get_effective_upload_limit
|
| 243 |
-
|
| 244 |
-
users = db.query(User).all()
|
| 245 |
-
result = []
|
| 246 |
-
|
| 247 |
-
for user in users:
|
| 248 |
-
plan = user.plan or "Individual"
|
| 249 |
-
if user.is_admin:
|
| 250 |
-
plan = "Admin"
|
| 251 |
-
|
| 252 |
-
limit = get_effective_upload_limit(db, plan)
|
| 253 |
-
used = user.monthly_upload_count or 0
|
| 254 |
-
percentage = round((used / limit * 100), 1) if limit > 0 else 0
|
| 255 |
-
|
| 256 |
-
result.append({
|
| 257 |
-
"id": user.id,
|
| 258 |
-
"email": user.email,
|
| 259 |
-
"full_name": user.full_name,
|
| 260 |
-
"visique_id": user.visique_id,
|
| 261 |
-
"plan": plan,
|
| 262 |
-
"uploads_used": used,
|
| 263 |
-
"uploads_limit": limit,
|
| 264 |
-
"usage_percentage": percentage,
|
| 265 |
-
"reset_date": user.upload_reset_date.isoformat() if user.upload_reset_date else None
|
| 266 |
-
})
|
| 267 |
-
|
| 268 |
-
# Sort by usage percentage descending
|
| 269 |
-
result.sort(key=lambda x: x["usage_percentage"], reverse=True)
|
| 270 |
-
return result
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
# =============================================================================
|
| 274 |
-
# FEATURE FLAG ENDPOINTS
|
| 275 |
-
# =============================================================================
|
| 276 |
-
|
| 277 |
-
@router.get("/features")
|
| 278 |
-
def get_feature_matrix(
|
| 279 |
-
db: Session = Depends(get_db),
|
| 280 |
-
current_user: User = Depends(get_current_admin)
|
| 281 |
-
):
|
| 282 |
-
"""
|
| 283 |
-
Get the full feature matrix for admin console.
|
| 284 |
-
Shows all features grouped by category with per-plan toggles.
|
| 285 |
-
"""
|
| 286 |
-
from app.services.feature_service import get_feature_matrix as get_matrix
|
| 287 |
-
return get_matrix(db)
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
@router.get("/features/registry")
|
| 291 |
-
def get_feature_registry(
|
| 292 |
-
current_user: User = Depends(get_current_admin)
|
| 293 |
-
):
|
| 294 |
-
"""
|
| 295 |
-
Get the feature registry - all available features.
|
| 296 |
-
Useful for understanding what features can be controlled.
|
| 297 |
-
"""
|
| 298 |
-
from app.core.feature_registry import get_features_by_category, get_all_feature_ids
|
| 299 |
-
|
| 300 |
-
categories = get_features_by_category()
|
| 301 |
-
result = {}
|
| 302 |
-
|
| 303 |
-
for cat_name, features in categories.items():
|
| 304 |
-
result[cat_name] = [
|
| 305 |
-
{
|
| 306 |
-
"id": f.id,
|
| 307 |
-
"name": f.name,
|
| 308 |
-
"description": f.description,
|
| 309 |
-
"default_enabled": f.default_enabled
|
| 310 |
-
}
|
| 311 |
-
for f in features
|
| 312 |
-
]
|
| 313 |
-
|
| 314 |
-
return {
|
| 315 |
-
"total_features": len(get_all_feature_ids()),
|
| 316 |
-
"categories": result
|
| 317 |
-
}
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
@router.get("/features/{plan_name}")
|
| 321 |
-
def get_plan_features(
|
| 322 |
-
plan_name: str,
|
| 323 |
-
db: Session = Depends(get_db),
|
| 324 |
-
current_user: User = Depends(get_current_admin)
|
| 325 |
-
):
|
| 326 |
-
"""
|
| 327 |
-
Get enabled features for a specific plan.
|
| 328 |
-
"""
|
| 329 |
-
from app.services.feature_service import get_effective_features, get_effective_upload_limit
|
| 330 |
-
from app.core.plan_config import get_all_plans, get_all_engines
|
| 331 |
-
|
| 332 |
-
if plan_name not in get_all_plans() and plan_name not in get_all_engines():
|
| 333 |
-
raise HTTPException(status_code=404, detail=f"Unknown plan: {plan_name}")
|
| 334 |
-
|
| 335 |
-
return {
|
| 336 |
-
"plan": plan_name,
|
| 337 |
-
"upload_limit": get_effective_upload_limit(db, plan_name),
|
| 338 |
-
"enabled_features": get_effective_features(db, plan_name)
|
| 339 |
-
}
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
@router.put("/features/{plan_name}")
|
| 343 |
-
def update_plan_features(
|
| 344 |
-
plan_name: str,
|
| 345 |
-
request: FeatureToggleRequest,
|
| 346 |
-
db: Session = Depends(get_db),
|
| 347 |
-
current_user: User = Depends(get_current_admin)
|
| 348 |
-
):
|
| 349 |
-
"""
|
| 350 |
-
Bulk update features for a plan.
|
| 351 |
-
"""
|
| 352 |
-
from app.services.feature_service import bulk_set_features
|
| 353 |
-
from app.core.plan_config import get_all_plans, get_all_engines
|
| 354 |
-
|
| 355 |
-
if plan_name not in get_all_plans() and plan_name not in get_all_engines():
|
| 356 |
-
raise HTTPException(status_code=404, detail=f"Unknown plan: {plan_name}")
|
| 357 |
-
|
| 358 |
-
count = bulk_set_features(db, plan_name, request.feature_states, current_user.id)
|
| 359 |
-
|
| 360 |
-
return {
|
| 361 |
-
"message": f"Updated {count} features for {plan_name}",
|
| 362 |
-
"plan": plan_name,
|
| 363 |
-
"updated_count": count
|
| 364 |
-
}
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
@router.post("/features/{plan_name}/reset")
|
| 368 |
-
def reset_plan_features(
|
| 369 |
-
plan_name: str,
|
| 370 |
-
db: Session = Depends(get_db),
|
| 371 |
-
current_user: User = Depends(get_current_admin)
|
| 372 |
-
):
|
| 373 |
-
"""
|
| 374 |
-
Reset a plan's features to defaults (removes all overrides).
|
| 375 |
-
"""
|
| 376 |
-
from app.services.feature_service import reset_plan_to_defaults
|
| 377 |
-
from app.core.plan_config import get_all_plans, get_all_engines
|
| 378 |
-
|
| 379 |
-
if plan_name not in get_all_plans() and plan_name not in get_all_engines():
|
| 380 |
-
raise HTTPException(status_code=404, detail=f"Unknown plan: {plan_name}")
|
| 381 |
-
|
| 382 |
-
count = reset_plan_to_defaults(db, plan_name)
|
| 383 |
-
|
| 384 |
-
return {
|
| 385 |
-
"message": f"Reset {plan_name} to defaults, removed {count} overrides",
|
| 386 |
-
"plan": plan_name,
|
| 387 |
-
"removed_overrides": count
|
| 388 |
-
}
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
class UploadLimitRequest(BaseModel):
|
| 392 |
-
upload_limit: int
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
@router.put("/features/{plan_name}/limit")
|
| 396 |
-
def update_plan_upload_limit(
|
| 397 |
-
plan_name: str,
|
| 398 |
-
request: UploadLimitRequest,
|
| 399 |
-
db: Session = Depends(get_db),
|
| 400 |
-
current_user: User = Depends(get_current_admin)
|
| 401 |
-
):
|
| 402 |
-
"""
|
| 403 |
-
Update upload limit for a plan.
|
| 404 |
-
"""
|
| 405 |
-
from app.models.feature_flags import PlanUploadLimit
|
| 406 |
-
from app.core.plan_config import get_all_plans
|
| 407 |
-
|
| 408 |
-
if plan_name not in get_all_plans():
|
| 409 |
-
raise HTTPException(status_code=404, detail=f"Unknown plan: {plan_name}")
|
| 410 |
-
|
| 411 |
-
# Find or create limit override
|
| 412 |
-
override = db.query(PlanUploadLimit).filter(
|
| 413 |
-
PlanUploadLimit.plan_name == plan_name
|
| 414 |
-
).first()
|
| 415 |
-
|
| 416 |
-
if override:
|
| 417 |
-
override.upload_limit = request.upload_limit
|
| 418 |
-
override.updated_by_id = current_user.id
|
| 419 |
-
else:
|
| 420 |
-
override = PlanUploadLimit(
|
| 421 |
-
plan_name=plan_name,
|
| 422 |
-
upload_limit=request.upload_limit,
|
| 423 |
-
updated_by_id=current_user.id
|
| 424 |
-
)
|
| 425 |
-
db.add(override)
|
| 426 |
-
|
| 427 |
-
db.commit()
|
| 428 |
-
|
| 429 |
-
return {
|
| 430 |
-
"message": f"Updated upload limit for {plan_name}",
|
| 431 |
-
"plan": plan_name,
|
| 432 |
-
"new_limit": request.upload_limit
|
| 433 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/api/auth.py
DELETED
|
@@ -1,221 +0,0 @@
|
|
| 1 |
-
from datetime import datetime, timedelta
|
| 2 |
-
from typing import Optional
|
| 3 |
-
from fastapi import APIRouter, Depends, HTTPException, status
|
| 4 |
-
from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
|
| 5 |
-
from jose import JWTError, jwt
|
| 6 |
-
from passlib.context import CryptContext
|
| 7 |
-
from sqlalchemy.orm import Session
|
| 8 |
-
from app.core.database import get_db
|
| 9 |
-
from app.models.user import User
|
| 10 |
-
from app.schemas.user import UserCreate, UserResponse, Token, UpgradeRequest
|
| 11 |
-
from app.core.security import SECRET_KEY, ALGORITHM, ACCESS_TOKEN_EXPIRE_MINUTES
|
| 12 |
-
|
| 13 |
-
from app.core.security import verify_password, get_password_hash, create_access_token, ALGORITHM, SECRET_KEY, ACCESS_TOKEN_EXPIRE_MINUTES
|
| 14 |
-
|
| 15 |
-
router = APIRouter(prefix="/auth", tags=["auth"])
|
| 16 |
-
|
| 17 |
-
@router.get("/probe")
|
| 18 |
-
def probe():
|
| 19 |
-
return {"status": "auth_router_working"}
|
| 20 |
-
|
| 21 |
-
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/v1/auth/login")
|
| 22 |
-
|
| 23 |
-
@router.post("/register", response_model=UserResponse)
|
| 24 |
-
def register(user: UserCreate, db: Session = Depends(get_db)):
|
| 25 |
-
try:
|
| 26 |
-
db_user = db.query(User).filter(User.email == user.email).first()
|
| 27 |
-
if db_user:
|
| 28 |
-
raise HTTPException(status_code=400, detail="Email already registered")
|
| 29 |
-
hashed_password = get_password_hash(user.password)
|
| 30 |
-
|
| 31 |
-
# Valid Admin Keys
|
| 32 |
-
VALID_ADMIN_KEYS = [
|
| 33 |
-
"VSQADM001", "VSQADM002", "VSQADM003",
|
| 34 |
-
"VSQADM004", "VSQADM005", "VSQADM006"
|
| 35 |
-
]
|
| 36 |
-
|
| 37 |
-
# Check Admin Key
|
| 38 |
-
is_admin = False
|
| 39 |
-
is_super_admin = False
|
| 40 |
-
SUPER_ADMIN_KEYS = ["VSQADM003", "VSQADM006"]
|
| 41 |
-
|
| 42 |
-
if user.admin_key and user.admin_key in VALID_ADMIN_KEYS:
|
| 43 |
-
is_admin = True
|
| 44 |
-
if user.admin_key in SUPER_ADMIN_KEYS:
|
| 45 |
-
is_super_admin = True
|
| 46 |
-
|
| 47 |
-
# Generate Visique ID
|
| 48 |
-
# Generate Visique ID
|
| 49 |
-
import uuid
|
| 50 |
-
import random
|
| 51 |
-
if is_admin:
|
| 52 |
-
# VISI-###### (6 digits)
|
| 53 |
-
digits = ''.join([str(random.randint(0, 9)) for _ in range(6)])
|
| 54 |
-
visique_id = f"VISI-{digits}"
|
| 55 |
-
else:
|
| 56 |
-
visique_id = f"VSQ-{str(uuid.uuid4())[:8].upper()}"
|
| 57 |
-
|
| 58 |
-
new_user = User(
|
| 59 |
-
email=user.email,
|
| 60 |
-
hashed_password=hashed_password,
|
| 61 |
-
full_name=user.full_name,
|
| 62 |
-
company_name=user.company_name,
|
| 63 |
-
is_admin=is_admin,
|
| 64 |
-
is_super_admin=is_super_admin,
|
| 65 |
-
visique_id=visique_id
|
| 66 |
-
)
|
| 67 |
-
db.add(new_user)
|
| 68 |
-
db.commit()
|
| 69 |
-
db.refresh(new_user)
|
| 70 |
-
return new_user
|
| 71 |
-
except HTTPException as he:
|
| 72 |
-
raise he
|
| 73 |
-
except Exception as e:
|
| 74 |
-
print(f"Registration Error: {str(e)}")
|
| 75 |
-
raise HTTPException(status_code=500, detail=f"Registration failed: {str(e)}")
|
| 76 |
-
|
| 77 |
-
@router.post("/login", response_model=Token)
def login(form_data: OAuth2PasswordRequestForm = Depends(), db: Session = Depends(get_db)):
    """Exchange e-mail/password credentials for a bearer access token.

    The OAuth2 form's ``username`` field is matched against ``User.email``.

    Raises:
        HTTPException: 401 when no such user exists or the password does not
            verify.
    """
    account = db.query(User).filter(User.email == form_data.username).first()
    if not (account and verify_password(form_data.password, account.hashed_password)):
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Incorrect username or password",
            headers={"WWW-Authenticate": "Bearer"},
        )

    lifetime = timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
    issued_token = create_access_token(
        data={"sub": account.email},
        expires_delta=lifetime,
    )
    return {"access_token": issued_token, "token_type": "bearer"}
|
| 91 |
-
|
| 92 |
-
async def get_current_user(
    token: Optional[str] = Depends(oauth2_scheme),
    db: Session = Depends(get_db),
    query_token: Optional[str] = None
):
    """Resolve the authenticated ``User`` from a JWT.

    The token from ``oauth2_scheme`` is preferred; ``query_token`` is used
    only when that one is missing or empty. The token's ``sub`` claim is
    looked up as the user's e-mail address.

    Raises:
        HTTPException: 401 when no token is supplied, the token cannot be
            decoded, it carries no ``sub`` claim, or no user has that e-mail.
    """
    unauthorized = HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Could not validate credentials",
        headers={"WWW-Authenticate": "Bearer"},
    )

    presented = token if token else query_token
    if not presented:
        raise unauthorized

    try:
        claims = jwt.decode(presented, SECRET_KEY, algorithms=[ALGORITHM])
    except JWTError:
        raise unauthorized

    subject: str = claims.get("sub")
    if subject is None:
        raise unauthorized

    account = db.query(User).filter(User.email == subject).first()
    if account is None:
        raise unauthorized
    return account
|
| 118 |
-
|
| 119 |
-
@router.get("/me", response_model=UserResponse)
async def read_users_me(current_user: User = Depends(get_current_user)):
    """Return the profile of the currently authenticated user."""
    me = current_user
    return me
|
| 122 |
-
|
| 123 |
-
from app.core.config import settings
|
| 124 |
-
from app.core.stripe_config import create_checkout_session
|
| 125 |
-
import stripe
|
| 126 |
-
from fastapi import Request
|
| 127 |
-
|
| 128 |
-
@router.post("/create-checkout-session")
def create_payment(
    plan_id: str,  # Stripe Price ID of the plan being purchased
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Start a Stripe Checkout session for the current user.

    Returns:
        ``{"url": ...}`` with the URL of the created checkout session.

    Raises:
        HTTPException: 400 when ``create_checkout_session`` returns a falsy
            value.
    """
    checkout = create_checkout_session(current_user, plan_id)
    if checkout:
        return {"url": checkout.url}
    raise HTTPException(status_code=400, detail="Error creating payment session")
|
| 138 |
-
|
| 139 |
-
@router.post("/webhook")
async def stripe_webhook(request: Request, db: Session = Depends(get_db)):
    """Handle Stripe webhook deliveries.

    The raw request body is verified against the ``stripe-signature`` header
    using ``settings.STRIPE_WEBHOOK_SECRET``. For
    ``checkout.session.completed`` events, the user referenced by
    ``client_reference_id`` is moved to the "Business" plan for 30 days and a
    ``Payment`` row is recorded. All other events are acknowledged untouched.

    Returns:
        ``{"status": "success"}`` once the event has been processed or ignored.

    Raises:
        HTTPException: 400 when the payload is malformed or the signature
            does not verify.
    """
    payload = await request.body()
    sig_header = request.headers.get("stripe-signature")

    try:
        event = stripe.Webhook.construct_event(
            payload, sig_header, settings.STRIPE_WEBHOOK_SECRET
        )
    except ValueError as exc:
        raise HTTPException(status_code=400, detail="Invalid payload") from exc
    except stripe.error.SignatureVerificationError as exc:
        raise HTTPException(status_code=400, detail="Invalid signature") from exc

    if event["type"] == "checkout.session.completed":
        session = event["data"]["object"]

        # Retrieve user and update plan.
        # Note: the reference id arrives as a string. A non-numeric value is
        # ignored instead of raising, so Stripe does not receive a 500 and
        # retry a delivery that can never succeed.
        user = None
        user_id = session.get("client_reference_id")
        if user_id:
            try:
                numeric_id = int(user_id)
            except (TypeError, ValueError):
                numeric_id = None
            if numeric_id is not None:
                user = db.query(User).filter(User.id == numeric_id).first()

        if user:
            user.plan = "Business"  # Or derive from session
            user.plan_expires_at = datetime.utcnow() + timedelta(days=30)

            # Record payment. ``amount_total`` is in the smallest currency
            # unit and may be present but null, hence the ``or 0``.
            from app.models.user import Payment
            amount_total = session.get("amount_total") or 0
            new_payment = Payment(
                user_id=user.id,
                amount=amount_total / 100.0,
                status="paid",
                plan_name="Business",
                date=datetime.utcnow(),
            )
            db.add(new_payment)
            db.commit()

    return {"status": "success"}
|
| 178 |
-
|
| 179 |
-
from typing import List
|
| 180 |
-
from app.schemas.user import PaymentResponse
|
| 181 |
-
from app.models.user import Payment
|
| 182 |
-
|
| 183 |
-
@router.get("/payments/me", response_model=List[PaymentResponse])
def read_my_payments(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    """List every payment recorded for the authenticated user."""
    own_payments = db.query(Payment).filter(Payment.user_id == current_user.id)
    return own_payments.all()
|
| 186 |
-
|
| 187 |
-
from fastapi import UploadFile, File
|
| 188 |
-
import shutil
|
| 189 |
-
import os
|
| 190 |
-
|
| 191 |
-
@router.post("/me/avatar")
async def upload_avatar(
    file: UploadFile = File(...),
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Store an uploaded avatar image and attach it to the current user.

    Only ``jpg``, ``jpeg``, ``png`` and ``webp`` extensions are accepted
    (case-insensitive). The file is written under ``uploads/avatars`` and the
    user's ``profile_picture_url`` is set to the matching
    ``/api/v1/static/avatars/...`` URL.

    Returns:
        A confirmation message together with the new avatar URL.

    Raises:
        HTTPException: 400 when the filename is missing or its extension is
            not an accepted image format.
    """
    # The filename is client-controlled: drop any directory components
    # (either separator style) before it is used in a path or a URL.
    raw_name = file.filename or ""
    safe_name = os.path.basename(raw_name.replace("\\", "/"))

    # Determine file extension
    ext = os.path.splitext(safe_name)[1].lstrip(".").lower()
    if ext not in ("jpg", "jpeg", "png", "webp"):
        raise HTTPException(status_code=400, detail="Invalid image format. Use JPG, PNG, or WebP.")

    # Save file
    filename = f"avatar_{current_user.id}_{safe_name}"
    upload_dir = "uploads/avatars"
    # exist_ok avoids the check-then-create race between concurrent uploads.
    os.makedirs(upload_dir, exist_ok=True)

    file_path = os.path.join(upload_dir, filename)
    with open(file_path, "wb+") as buffer:
        shutil.copyfileobj(file.file, buffer)

    # Update user profile with the public URL of the stored file.
    # NOTE(review): presumably ``uploads/avatars`` is served under
    # ``/api/v1/static/avatars`` by a static mount defined elsewhere; confirm.
    current_user.profile_picture_url = f"/api/v1/static/avatars/{filename}"

    db.commit()
    db.refresh(current_user)

    return {"message": "Avatar updated", "url": current_user.profile_picture_url}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/api/endpoints.py
DELETED
|
@@ -1,742 +0,0 @@
|
|
| 1 |
-
from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
|
| 2 |
-
from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
|
| 3 |
-
from app.schemas.financial import StandardizedDataPackage
|
| 4 |
-
from app.core.security import create_access_token
|
| 5 |
-
from typing import Annotated
|
| 6 |
-
from pydantic import BaseModel
|
| 7 |
-
from datetime import date
|
| 8 |
-
import os
|
| 9 |
-
from app.services.ingestion.parser_csv import CSVParser
|
| 10 |
-
from app.services.ingestion.parser_pdf import PDFParser
|
| 11 |
-
from app.services.analysis.kpi import KPIAnalyzer
|
| 12 |
-
from app.services.analysis.risk import RiskAnalyzer
|
| 13 |
-
from app.services.analysis.health_score import HealthScoreAnalyzer
|
| 14 |
-
from app.services.analysis.fundamental import FundamentalAnalyzer
|
| 15 |
-
from app.services.analysis.factory import AnalysisFactory
|
| 16 |
-
from app.services.analysis.growth import GrowthAnalyzer
|
| 17 |
-
from app.services.analysis.simulation import SimulationService
|
| 18 |
-
from app.services.reporting.pdf_report import PDFReporter
|
| 19 |
-
from app.services.reporting.pptx_report import PPTXReporter
|
| 20 |
-
from app.schemas.financial import StandardizedDataPackage, FinancialReport, IncomeStatementStandard, BalanceSheetStandard, CashFlowStandard, KPIMetrics, RiskAnalysis, HealthScoreBreakdown
|
| 21 |
-
from app.schemas.chat import ChatRequest, ChatResponse
|
| 22 |
-
from app.api.auth import get_current_user
|
| 23 |
-
from app.models.user import User, Analysis
|
| 24 |
-
from app.core.database import get_db
|
| 25 |
-
from sqlalchemy.orm import Session
|
| 26 |
-
import json
|
| 27 |
-
from fastapi.responses import FileResponse
|
| 28 |
-
from app.services.feature_service import get_effective_features
|
| 29 |
-
|
| 30 |
-
router = APIRouter(prefix="/analysis", tags=["analysis"])
|
| 31 |
-
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token")
|
| 32 |
-
|
| 33 |
-
@router.post("/token")
async def login(form_data: Annotated[OAuth2PasswordRequestForm, Depends()]):
    """Issue a token for the single built-in demo account.

    Kept for the legacy demo only; real authentication lives at /auth/login.
    """
    # NOTE(review): the demo credentials are hard-coded in source.
    is_demo_account = form_data.username == "analyst" and form_data.password == "visique"
    if not is_demo_account:
        raise HTTPException(status_code=400, detail="Incorrect username or password")

    demo_token = create_access_token(data={"sub": form_data.username})
    return {"access_token": demo_token, "token_type": "bearer"}
|
| 39 |
-
|
| 40 |
-
# Admin Dependency
|
| 41 |
-
def get_current_admin(current_user: User = Depends(get_current_user)):
    """Dependency that lets only admin users through.

    Returns:
        The authenticated user, unchanged, when ``is_admin`` is truthy.

    Raises:
        HTTPException: 403 when the authenticated user is not an admin.
    """
    # The original repeated this check and return a second time after the
    # first ``return``; that copy was unreachable and has been removed.
    if not current_user.is_admin:
        raise HTTPException(status_code=403, detail="Admin privileges required")
    return current_user
|
| 48 |
-
|
| 49 |
-
@router.get("/admin/users")
def get_all_users(
    admin: User = Depends(get_current_admin),
    db: Session = Depends(get_db)
):
    """Return a summary row for every user (admin only).

    Each row includes the user's number of analyses and the preferred
    engine, which falls back to ``"v1"`` when the attribute is absent.
    """
    summaries = []
    for account in db.query(User).all():
        summaries.append({
            "id": account.id,
            "email": account.email,
            "full_name": account.full_name,
            "company_name": account.company_name,
            "is_admin": account.is_admin,
            "created_at": account.created_at,
            "analysis_count": len(account.analyses),
            "preferred_engine": getattr(account, "preferred_engine", "v1"),
        })
    return summaries
|
| 68 |
-
|
| 69 |
-
@router.get("/admin/analyses")
def get_all_analyses(
    admin: User = Depends(get_current_admin),
    db: Session = Depends(get_db)
):
    """Return all analyses across users, newest first (admin only)."""
    newest_first = db.query(Analysis).order_by(Analysis.timestamp.desc()).all()

    rows = []
    for record in newest_first:
        rows.append({
            "id": record.id,
            "user_email": record.owner.email,
            "user_company": record.owner.company_name,
            "company_name": record.company_name,
            "filename": record.input_filename,
            "timestamp": record.timestamp,
        })
    return rows
|
| 86 |
-
|
| 87 |
-
@router.get("/admin/analyses/{analysis_id}/download")
def admin_download_file(
    analysis_id: int,
    admin: User = Depends(get_current_admin),
    db: Session = Depends(get_db)
):
    """Download the original upload behind any analysis (admin only).

    The file is served with an ``ADMIN_EXPORT_`` prefix on its original name.

    Raises:
        HTTPException: 404 when the analysis does not exist, has no stored
            file path, or the file is no longer on disk.
    """
    record = db.query(Analysis).filter(Analysis.id == analysis_id).first()
    stored_path = record.stored_filename if record else None
    if not stored_path:
        raise HTTPException(status_code=404, detail="File not found")

    if not os.path.exists(stored_path):
        raise HTTPException(status_code=404, detail="File missing from server storage")

    export_name = f"ADMIN_EXPORT_{record.input_filename}"
    return FileResponse(
        path=stored_path,
        filename=export_name,
        media_type='application/octet-stream',
    )
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
import json
|
| 108 |
-
|
| 109 |
-
# Admin Dependency
|
| 110 |
-
def get_current_admin(current_user: User = Depends(get_current_user)):
    """Dependency that returns the current user only if they are an admin.

    Raises:
        HTTPException: 403 when the authenticated user is not an admin.
    """
    # NOTE(review): this module defines ``get_current_admin`` more than once
    # with the same body; the later definition is the one bound at import time.
    if current_user.is_admin:
        return current_user
    raise HTTPException(status_code=403, detail="Admin privileges required")
|
| 114 |
-
|
| 115 |
-
@router.post("/upload/csv", response_model=StandardizedDataPackage)
async def analyze_csv(
    file: UploadFile = File(...),
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Upload a CSV file, analyse it and persist the result.

    The upload is streamed to ``uploads/`` under a UUID-prefixed name, parsed
    with ``CSVParser``, analysed with the engine chosen by ``AnalysisFactory``
    and stored as an ``Analysis`` row. The user's upload counter is
    incremented only after the row has been saved.

    Returns:
        The ``StandardizedDataPackage`` with ``analysis_id`` set to the new row.

    Raises:
        HTTPException: 403 when the monthly upload limit is reached, 400 when
            the file is not a ``.csv``, 500 when parsing, analysis or
            persistence fails (the stored upload is removed in that case).
    """
    import uuid
    from app.services.feature_service import check_upload_limit, increment_upload_count

    # Check upload limit
    limit_check = check_upload_limit(db, current_user)
    if not limit_check["can_upload"]:
        raise HTTPException(
            status_code=403,
            detail=f"Monthly upload limit reached ({limit_check['uploads_limit']} uploads). Upgrade your plan for more uploads. Resets on {limit_check['reset_date'][:10]}."
        )

    # A missing filename is treated as an invalid type instead of crashing,
    # and the extension check is case-insensitive.
    original_name = file.filename or ""
    if not original_name.lower().endswith('.csv'):
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a .csv file.")

    # Secure filename and path: the client-supplied name may contain directory
    # components, so only its final path segment is kept.
    base_name = os.path.basename(original_name.replace("\\", "/"))
    safe_filename = f"{uuid.uuid4()}_{base_name}"
    upload_dir = "uploads"
    os.makedirs(upload_dir, exist_ok=True)  # no check-then-create race

    file_path = os.path.join(upload_dir, safe_filename)

    try:
        with open(file_path, "wb+") as file_object:
            while content := await file.read(1024 * 1024):  # Stream 1MB chunks
                file_object.write(content)

        report = CSVParser.parse(file_path)

        # Run unified analysis with the engine selected for this user.
        analyzer = AnalysisFactory.get_analyzer(current_user)
        # Fetch enabled features for the user's plan.
        # NOTE(review): the PDF/XLSX endpoints call resolve_user_features(db,
        # current_user) instead; confirm whether this difference is intended.
        enabled_features = get_effective_features(db, current_user.plan or "Free")
        analysis_result = analyzer.analyze(report, user_address=current_user.address, enabled_features=enabled_features)

        # Merge industry insights, recommendations and risk factors (which
        # contain the "Pain Point:" entries) into one insights list.
        risk_factors = analysis_result["risk_analysis"].risk_factors if analysis_result.get("risk_analysis") else []
        all_insights = analysis_result["insights"] + analysis_result["recommendations"] + risk_factors

        result_package = StandardizedDataPackage(
            raw_data=report,
            kpis=analysis_result["kpis"],
            risk_analysis=analysis_result["risk_analysis"],
            health_score=analysis_result["health_score"],
            insights=all_insights,
            runway_forecast=analysis_result["runway_forecast"],
            optimization_insights=analysis_result["optimization_insights"],
            geo_analysis=analysis_result.get("geo_analysis")
        )

        # Save to DB
        db_analysis = Analysis(
            user_id=current_user.id,
            company_name=report.company_name,
            input_filename=file.filename,
            stored_filename=file_path,
            result_json=result_package.json()
        )
        db.add(db_analysis)
        db.commit()
        db.refresh(db_analysis)

        result_package.analysis_id = db_analysis.id

        # Increment upload count AFTER successful save
        increment_upload_count(db, current_user)

        return result_package

    except Exception as e:
        # Cleanup if analysis fails, but keep the file if successful
        if os.path.exists(file_path):
            os.remove(file_path)
        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}") from e
|
| 201 |
-
|
| 202 |
-
@router.post("/save")
async def save_analysis_result(
    payload: dict,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """
    Receives pre-computed analysis results from Vercel serverless functions
    and persists them to the database. This endpoint does NOT run analysis -
    it only handles authentication and database storage.

    The ``original_filename`` key is removed from ``payload`` and stored as
    the input filename (default ``"uploaded_file"``); the remaining payload is
    serialised to JSON. The company name is read from
    ``payload["raw_data"]["company_name"]`` and defaults to ``"Unknown"``.

    Returns:
        ``{"status": "saved", "analysis_id": <new id>}``.

    Raises:
        HTTPException: 500 when the analysis cannot be stored.
    """
    # ``increment_upload_count`` is not among this module's visible top-level
    # imports; import it locally like the upload endpoints do, so the call
    # below cannot fail with NameError after the row is already committed.
    from app.services.feature_service import increment_upload_count

    try:
        company_name = "Unknown"
        raw_data = payload.get("raw_data", {})
        if isinstance(raw_data, dict):
            company_name = raw_data.get("company_name", "Unknown")

        original_filename = payload.pop("original_filename", "uploaded_file")

        db_analysis = Analysis(
            user_id=current_user.id,
            company_name=company_name,
            input_filename=original_filename,
            # Marker value: no file is stored on this server for these rows.
            stored_filename="vercel_processed",
            result_json=json.dumps(payload)
        )
        db.add(db_analysis)
        db.commit()
        db.refresh(db_analysis)

        # Increment upload count
        increment_upload_count(db, current_user)

        return {"status": "saved", "analysis_id": db_analysis.id}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to save analysis: {str(e)}") from e
|
| 238 |
-
|
| 239 |
-
@router.get("/history")
def get_history(
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """List the current user's analyses, newest first.

    Only lightweight metadata is returned; the full result JSON is left out
    to keep the response small.
    """
    own_analyses = (
        db.query(Analysis)
        .filter(Analysis.user_id == current_user.id)
        .order_by(Analysis.timestamp.desc())
        .all()
    )

    entries = []
    for record in own_analyses:
        entries.append({
            "id": record.id,
            "company_name": record.company_name,
            "filename": record.input_filename,
            "timestamp": record.timestamp,
        })
    return entries
|
| 255 |
-
|
| 256 |
-
@router.get("/history/{analysis_id}", response_model=StandardizedDataPackage)
def get_analysis_detail(
    analysis_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Return the stored result package of one of the user's analyses.

    Raises:
        HTTPException: 404 when the analysis does not exist or belongs to
            another user.
    """
    owned = db.query(Analysis).filter(
        Analysis.id == analysis_id,
        Analysis.user_id == current_user.id,
    )
    record = owned.first()
    if not record:
        raise HTTPException(status_code=404, detail="Analysis not found")

    package = StandardizedDataPackage.parse_raw(record.result_json)
    package.analysis_id = record.id
    return package
|
| 269 |
-
|
| 270 |
-
from fastapi.responses import FileResponse
|
| 271 |
-
@router.get("/history/{analysis_id}/download")
def download_original_file(
    analysis_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Download the file originally uploaded for one of the user's analyses.

    Raises:
        HTTPException: 404 when the analysis is not the user's, has no stored
            file path, or the file is no longer on disk.
    """
    record = db.query(Analysis).filter(
        Analysis.id == analysis_id,
        Analysis.user_id == current_user.id,
    ).first()

    stored_path = record.stored_filename if record else None
    if not stored_path:
        raise HTTPException(status_code=404, detail="File not found")

    if not os.path.exists(stored_path):
        raise HTTPException(status_code=404, detail="File missing from server storage")

    return FileResponse(
        path=stored_path,
        filename=record.input_filename,
        media_type='application/octet-stream',
    )
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
@router.delete("/history/{analysis_id}")
def delete_analysis(
    analysis_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Delete one of the user's analyses together with its stored upload.

    Raises:
        HTTPException: 404 when the analysis does not exist or belongs to
            another user.
    """
    record = db.query(Analysis).filter(
        Analysis.id == analysis_id,
        Analysis.user_id == current_user.id,
    ).first()
    if not record:
        raise HTTPException(status_code=404, detail="Analysis not found")

    # Best-effort removal of the file on disk; the database row is deleted
    # even when the file cannot be removed.
    stored_path = record.stored_filename
    if stored_path and os.path.exists(stored_path):
        try:
            os.remove(stored_path)
        except OSError:
            pass

    db.delete(record)
    db.commit()
    return {"status": "success", "message": "Analysis deleted"}
|
| 311 |
-
|
| 312 |
-
# Request body for PATCH /history/{analysis_id}.
class UpdateAnalysisRequest(BaseModel):
    # New company name to store on the analysis.
    company_name: str
|
| 314 |
-
|
| 315 |
-
@router.patch("/history/{analysis_id}")
def update_analysis(
    analysis_id: int,
    request: UpdateAnalysisRequest,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Rename the company on one of the user's analyses.

    The new name is written to the ``Analysis`` row and, when the stored
    result JSON can be parsed, to ``raw_data.company_name`` inside it as well.

    Raises:
        HTTPException: 404 when the analysis does not exist or belongs to
            another user.
    """
    analysis = db.query(Analysis).filter(Analysis.id == analysis_id, Analysis.user_id == current_user.id).first()
    if not analysis:
        raise HTTPException(status_code=404, detail="Analysis not found")

    analysis.company_name = request.company_name

    # Update the stored JSON to reflect the new name (consistency). This is
    # best-effort: malformed or unexpectedly shaped JSON leaves the blob as it
    # is. Only the errors this patch can raise are caught, so anything
    # unrelated still surfaces instead of being swallowed by a bare ``except``.
    try:
        data = json.loads(analysis.result_json)
        data['raw_data']['company_name'] = request.company_name
        analysis.result_json = json.dumps(data)
    except (ValueError, TypeError, KeyError, IndexError):
        pass  # If JSON parsing fails, just update DB record

    db.commit()
    return {"status": "success", "message": "Analysis updated", "company_name": analysis.company_name}
|
| 338 |
-
|
| 339 |
-
@router.post("/upload/pdf", response_model=StandardizedDataPackage)
async def analyze_pdf(
    file: UploadFile = File(...),
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Upload a PDF file, analyse it and persist the result.

    The upload is streamed to ``uploads/`` under a UUID-prefixed name, parsed
    with ``PDFParser``, analysed with the engine chosen by ``AnalysisFactory``
    and stored as an ``Analysis`` row. The user's upload counter is
    incremented only after the row has been saved.

    Returns:
        The ``StandardizedDataPackage`` with ``analysis_id`` set to the new row.

    Raises:
        HTTPException: 403 when the monthly upload limit is reached, 400 when
            the file is not a ``.pdf``, 500 when parsing, analysis or
            persistence fails (the stored upload is removed in that case).
    """
    import uuid
    from app.services.feature_service import check_upload_limit, increment_upload_count

    # Check upload limit
    limit_check = check_upload_limit(db, current_user)
    if not limit_check["can_upload"]:
        raise HTTPException(
            status_code=403,
            detail=f"Monthly upload limit reached ({limit_check['uploads_limit']} uploads). Upgrade your plan for more uploads. Resets on {limit_check['reset_date'][:10]}."
        )

    # A missing filename is treated as an invalid type instead of crashing,
    # and the extension check is case-insensitive.
    original_name = file.filename or ""
    if not original_name.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a .pdf file.")

    # Keep only the final path segment of the client-supplied name so it
    # cannot steer the write outside the upload directory.
    base_name = os.path.basename(original_name.replace("\\", "/"))
    safe_filename = f"{uuid.uuid4()}_{base_name}"
    upload_dir = "uploads"
    os.makedirs(upload_dir, exist_ok=True)  # no check-then-create race

    file_path = os.path.join(upload_dir, safe_filename)

    try:
        with open(file_path, "wb+") as file_object:
            # Stream in 1MB chunks (as the CSV endpoint does) instead of
            # loading the whole upload into memory.
            while content := await file.read(1024 * 1024):
                file_object.write(content)

        report = PDFParser.parse(file_path)

        # Run unified analysis with the engine selected for this user.
        analyzer = AnalysisFactory.get_analyzer(current_user)

        # Resolve all feature flags (Plan + Custom + Engine limits)
        from app.services.feature_service import resolve_user_features
        enabled_features = resolve_user_features(db, current_user)

        analysis_result = analyzer.analyze(report, user_address=current_user.address, enabled_features=enabled_features)

        # Include risk_factors (which contain "Pain Point:" entries) in the insights array
        risk_factors = analysis_result["risk_analysis"].risk_factors if analysis_result.get("risk_analysis") else []
        all_insights = analysis_result["insights"] + analysis_result["recommendations"] + risk_factors

        result_package = StandardizedDataPackage(
            raw_data=report,
            kpis=analysis_result["kpis"],
            risk_analysis=analysis_result["risk_analysis"],
            health_score=analysis_result["health_score"],
            insights=all_insights,
            runway_forecast=analysis_result["runway_forecast"],
            optimization_insights=analysis_result["optimization_insights"],
            geo_analysis=analysis_result.get("geo_analysis")
        )

        # Save to DB
        db_analysis = Analysis(
            user_id=current_user.id,
            company_name=report.company_name,
            input_filename=file.filename,
            stored_filename=file_path,
            result_json=result_package.json()
        )
        db.add(db_analysis)
        db.commit()
        db.refresh(db_analysis)

        result_package.analysis_id = db_analysis.id

        # Increment upload count AFTER successful save
        increment_upload_count(db, current_user)

        return result_package

    except Exception as e:
        # Remove the stored upload when anything above fails.
        if os.path.exists(file_path):
            os.remove(file_path)
        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}") from e
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
# =============================================================================
|
| 422 |
-
# XLSX UPLOAD ENDPOINT
|
| 423 |
-
# =============================================================================
|
| 424 |
-
|
| 425 |
-
@router.post("/upload/xlsx", response_model=StandardizedDataPackage)
async def analyze_xlsx(
    file: UploadFile = File(...),
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Upload and analyze an Excel (.xlsx, .xls) file.

    The upload is streamed to ``uploads/`` under a UUID-prefixed name, parsed
    with ``XLSXParser``, analysed with the engine chosen by
    ``AnalysisFactory`` and stored as an ``Analysis`` row. The user's upload
    counter is incremented only after the row has been saved.

    Returns:
        The ``StandardizedDataPackage`` with ``analysis_id`` set to the new row.

    Raises:
        HTTPException: 403 when the monthly upload limit is reached, 400 when
            the file is neither ``.xlsx`` nor ``.xls``, 500 when parsing,
            analysis or persistence fails (the stored upload is removed).
    """
    import uuid
    from app.services.feature_service import check_upload_limit, increment_upload_count

    # Check upload limit
    limit_check = check_upload_limit(db, current_user)
    if not limit_check["can_upload"]:
        raise HTTPException(
            status_code=403,
            detail=f"Monthly upload limit reached ({limit_check['uploads_limit']} uploads). Upgrade your plan for more uploads. Resets on {limit_check['reset_date'][:10]}."
        )

    # A missing filename is treated as an invalid type instead of crashing,
    # and the extension check is case-insensitive.
    original_name = file.filename or ""
    if not original_name.lower().endswith(('.xlsx', '.xls')):
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload an .xlsx or .xls file.")

    # Keep only the final path segment of the client-supplied name so it
    # cannot steer the write outside the upload directory.
    base_name = os.path.basename(original_name.replace("\\", "/"))
    safe_filename = f"{uuid.uuid4()}_{base_name}"
    upload_dir = "uploads"
    os.makedirs(upload_dir, exist_ok=True)  # no check-then-create race

    file_path = os.path.join(upload_dir, safe_filename)

    try:
        with open(file_path, "wb+") as file_object:
            # Stream in 1MB chunks (as the CSV endpoint does) instead of
            # loading the whole upload into memory.
            while content := await file.read(1024 * 1024):
                file_object.write(content)

        # Use XLSX Parser
        from app.services.ingestion.parser_xlsx import XLSXParser
        report = XLSXParser.parse(file_path)

        # Run unified analysis with the engine selected for this user.
        analyzer = AnalysisFactory.get_analyzer(current_user)

        # Resolve all feature flags (Plan + Custom + Engine limits)
        from app.services.feature_service import resolve_user_features
        enabled_features = resolve_user_features(db, current_user)

        analysis_result = analyzer.analyze(report, user_address=current_user.address, enabled_features=enabled_features)

        # Risk factors are appended to the combined insights list.
        risk_factors = analysis_result["risk_analysis"].risk_factors if analysis_result.get("risk_analysis") else []
        all_insights = analysis_result["insights"] + analysis_result["recommendations"] + risk_factors

        result_package = StandardizedDataPackage(
            raw_data=report,
            kpis=analysis_result["kpis"],
            risk_analysis=analysis_result["risk_analysis"],
            health_score=analysis_result["health_score"],
            insights=all_insights,
            runway_forecast=analysis_result["runway_forecast"],
            optimization_insights=analysis_result["optimization_insights"],
            geo_analysis=analysis_result.get("geo_analysis")
        )

        # Save to DB
        db_analysis = Analysis(
            user_id=current_user.id,
            company_name=report.company_name,
            input_filename=file.filename,
            stored_filename=file_path,
            result_json=result_package.json()
        )
        db.add(db_analysis)
        db.commit()
        db.refresh(db_analysis)

        result_package.analysis_id = db_analysis.id

        # Increment upload count
        increment_upload_count(db, current_user)

        return result_package

    except Exception as e:
        # Remove the stored upload when anything above fails.
        if os.path.exists(file_path):
            os.remove(file_path)
        raise HTTPException(status_code=500, detail=f"XLSX Analysis failed: {str(e)}") from e
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
# =============================================================================
|
| 510 |
-
# BULK DELETE ENDPOINTS
|
| 511 |
-
# =============================================================================
|
| 512 |
-
|
| 513 |
-
# Request body for DELETE /history/bulk-delete.
class BulkDeleteRequest(BaseModel):
    # Primary keys of the analyses to delete.
    ids: list[int]
|
| 515 |
-
|
| 516 |
-
# NOTE(review): this route is registered after DELETE /history/{analysis_id};
# it looks like "bulk-delete" would be captured as ``analysis_id`` by that
# earlier route and rejected by int validation. Confirm the routing order.
@router.delete("/history/bulk-delete")
def bulk_delete_analyses(
    request: BulkDeleteRequest,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Delete multiple analyses at once."""
    removed = 0
    problems = []

    for wanted_id in request.ids:
        record = db.query(Analysis).filter(
            Analysis.id == wanted_id,
            Analysis.user_id == current_user.id,
        ).first()

        if record:
            # Best-effort removal of the stored upload; a failure here does
            # not stop the row from being deleted.
            stored_path = record.stored_filename
            if stored_path and os.path.exists(stored_path):
                try:
                    os.remove(stored_path)
                except OSError:
                    pass

            db.delete(record)
            removed += 1
        else:
            problems.append(f"Analysis {wanted_id} not found")

    # One commit for the whole batch.
    db.commit()

    return {
        "status": "success",
        "deleted_count": removed,
        "errors": problems or None,
    }
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
class DateRangeDeleteRequest(BaseModel):
    # Request body for the delete-range endpoint. Both bounds are calendar
    # dates given as text in YYYY-MM-DD form; the endpoint parses them.
    start_date: str  # YYYY-MM-DD
    end_date: str  # YYYY-MM-DD
|
| 558 |
-
|
| 559 |
-
@router.delete("/history/delete-range")
def delete_analyses_in_range(
    request: DateRangeDeleteRequest,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Delete all of the current user's analyses within a date range.

    Both ``start_date`` and ``end_date`` are inclusive calendar days.

    Args:
        request: Body carrying ``start_date`` and ``end_date`` (YYYY-MM-DD).
        current_user: Authenticated user whose analyses are considered.
        db: Database session.

    Returns:
        A dict with ``status``, ``deleted_count`` and the echoed ``date_range``.

    Raises:
        HTTPException: 400 when either date is not a valid YYYY-MM-DD date.
    """
    from datetime import datetime, timedelta

    try:
        start = datetime.strptime(request.start_date, "%Y-%m-%d")
        # Exclusive upper bound at the midnight after end_date. The previous
        # inclusive 23:59:59 bound missed timestamps in the last second of
        # the day that carry a fractional part.
        end_exclusive = datetime.strptime(request.end_date, "%Y-%m-%d") + timedelta(days=1)
    except (ValueError, OverflowError):
        raise HTTPException(status_code=400, detail="Invalid date format. Use YYYY-MM-DD.")

    # Find analyses in range
    analyses = db.query(Analysis).filter(
        Analysis.user_id == current_user.id,
        Analysis.timestamp >= start,
        Analysis.timestamp < end_exclusive
    ).all()

    deleted_count = 0
    stored_files = []
    for analysis in analyses:
        if analysis.stored_filename:
            stored_files.append(analysis.stored_filename)
        db.delete(analysis)
        deleted_count += 1

    db.commit()

    # Remove files only once the rows are really gone.
    for path in stored_files:
        try:
            os.remove(path)
        except OSError:
            # Best effort: a missing or undeletable file must not fail the request.
            pass

    return {
        "status": "success",
        "deleted_count": deleted_count,
        "date_range": f"{request.start_date} to {request.end_date}"
    }
|
| 598 |
-
|
| 599 |
-
class SimulationRequest(BaseModel):
    # Request body for the /simulate endpoint.
    # `data` is the analysed package whose raw_data is fed to the simulation.
    data: StandardizedDataPackage
    # Adjustments forwarded to SimulationService as the matching
    # delta_*_percent arguments; each defaults to "no change".
    delta_revenue: float = 0.0
    delta_cogs: float = 0.0
    delta_payroll: float = 0.0
    delta_marketing: float = 0.0
    delta_fixed_costs: float = 0.0
|
| 606 |
-
|
| 607 |
-
@router.post("/simulate", response_model=StandardizedDataPackage)
async def run_simulation(request: SimulationRequest, user: User = Depends(get_current_user)):
    """Run a what-if simulation on a previously analysed data package.

    Args:
        request: The data package plus the per-line adjustments.
        user: Authenticated user; only used to require authentication.

    Returns:
        Whatever ``SimulationService.run_simulation`` produces, serialised as
        a ``StandardizedDataPackage``.
    """
    return SimulationService.run_simulation(
        original_data=request.data.raw_data,
        delta_revenue_percent=request.delta_revenue,
        delta_cogs_percent=request.delta_cogs,
        delta_payroll_percent=request.delta_payroll,
        delta_marketing_percent=request.delta_marketing,
        delta_fixed_costs_percent=request.delta_fixed_costs
    )
|
| 617 |
-
|
| 618 |
-
@router.get("/history/{analysis_id}/export/pdf")
def export_analysis_pdf(
    analysis_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Render a stored analysis as a PDF and return it as a download.

    Args:
        analysis_id: Id of the analysis to export.
        current_user: Authenticated user; the analysis must belong to them.
        db: Database session.

    Returns:
        A ``FileResponse`` streaming the generated PDF.

    Raises:
        HTTPException: 404 when the analysis does not exist for this user,
            500 when its stored JSON cannot be parsed.
    """
    import shutil
    import tempfile
    from datetime import datetime

    from fastapi.responses import FileResponse
    from starlette.background import BackgroundTask

    analysis = db.query(Analysis).filter(
        Analysis.id == analysis_id,
        Analysis.user_id == current_user.id,
    ).first()
    if not analysis:
        raise HTTPException(status_code=404, detail="Analysis not found")

    # parse stored json
    try:
        data = StandardizedDataPackage.parse_raw(analysis.result_json)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Data corruption: {str(e)}")

    # Keep only characters that are safe in a file name.
    safe_name = "".join(x for x in data.raw_data.company_name if x.isalnum() or x in " _-")

    # Each request gets its own scratch directory: concurrent exports cannot
    # overwrite each other, and the directory can be removed as a unit.
    work_dir = tempfile.mkdtemp(prefix="report_")
    filename = os.path.join(work_dir, f"{safe_name}_{analysis.id}_report.pdf")

    try:
        PDFReporter.generate(data, filename)
    except Exception:
        shutil.rmtree(work_dir, ignore_errors=True)
        raise

    date_str = datetime.now().strftime("%Y-%m-%d")
    return FileResponse(
        filename,
        media_type='application/pdf',
        filename=f"Visi-Insight Report - {data.raw_data.company_name} - {date_str}.pdf",
        # Runs after the response has been sent, so the file is not leaked.
        background=BackgroundTask(shutil.rmtree, work_dir, ignore_errors=True),
    )
|
| 646 |
-
|
| 647 |
-
@router.post("/ai-cfo", response_model=str)
async def get_ai_summary(data: StandardizedDataPackage, user: User = Depends(get_current_user)):
    """Return an executive summary for the posted data package.

    Args:
        data: The analysed data package to summarise.
        user: Authenticated user; only used to require authentication.

    Returns:
        The result of ``AICFOService.generate_executive_summary(data)``.
    """
    # Imported lazily, at call time, rather than at module import.
    from app.services.intelligence.ai_cfo import AICFOService
    return AICFOService.generate_executive_summary(data)
|
| 651 |
-
|
| 652 |
-
@router.post("/chat", response_model=ChatResponse)
async def chat_with_data(request: ChatRequest, user: User = Depends(get_current_user)):
    """Answer a chat request against a hard-coded demo data package.

    MOCK IMPLEMENTATION: there is no persistent session store yet, so the
    question is answered against a fixed "Demo Corp" context rather than the
    caller's own uploaded data.

    Intended implementation:
        1. User uploads file -> backend stores a vector index id in the
           user session.
        2. /chat retrieves that index id and queries the vector DB.

    Args:
        request: The chat request forwarded to ``GeminiService.query``.
        user: Authenticated user; only used to require authentication.

    Returns:
        The result of ``GeminiService.query(request, dummy_data)``.
    """
    from app.schemas.financial import StandardizedDataPackage, FinancialReport, IncomeStatementStandard, BalanceSheetStandard, CashFlowStandard, KPIMetrics, RiskAnalysis, HealthScoreBreakdown
    from datetime import date

    # Dummy context so the endpoint works end to end.
    # In production, this would be retrieved from session or vector DB.
    dummy_data = StandardizedDataPackage(
        raw_data=FinancialReport(
            company_name="Demo Corp",
            period_end=date.today(),
            income_statement=IncomeStatementStandard(revenue=1200000, net_income=240000, cogs=600000),
            balance_sheet=BalanceSheetStandard(),
            cash_flow=CashFlowStandard()
        ),
        kpis=KPIMetrics(net_margin=20.0),
        risk_analysis=RiskAnalysis(risk_score=85, risk_factors=[], liquidity_risk="Low", solvency_risk="Low"),
        health_score=HealthScoreBreakdown(stability=20, profitability=20, growth=20, efficiency=20, total_score=80),
        insights=["Automated Report Generation Successful"],
        optimization_insights=None  # Should be populated normally
    )

    from app.services.intelligence.gemini_service import GeminiService
    return GeminiService.query(request, dummy_data)
|
| 690 |
-
|
| 691 |
-
@router.get("/export/pptx/{company_name}")
async def export_pptx(company_name: str):
    """Generate a demo PowerPoint deck for ``company_name`` and return it.

    The deck is built from fixed placeholder figures; only the company name
    comes from the request.

    Args:
        company_name: Name shown in the deck and used in the download name.

    Returns:
        A ``FileResponse`` streaming the generated .pptx file.
    """
    # NOTE(review): unlike the other export endpoint this route has no
    # Depends(get_current_user) -- confirm that it is meant to be public.
    import shutil
    import tempfile
    from datetime import date

    from fastapi.responses import FileResponse
    from starlette.background import BackgroundTask

    from app.schemas.financial import StandardizedDataPackage, FinancialReport, IncomeStatementStandard, BalanceSheetStandard, CashFlowStandard, KPIMetrics, RiskAnalysis, HealthScoreBreakdown

    dummy_data = StandardizedDataPackage(
        raw_data=FinancialReport(
            company_name=company_name,
            period_end=date.today(),
            income_statement=IncomeStatementStandard(revenue=1000000, net_income=200000, cogs=500000),
            balance_sheet=BalanceSheetStandard(),
            cash_flow=CashFlowStandard()
        ),
        kpis=KPIMetrics(net_margin=20.0),
        risk_analysis=RiskAnalysis(risk_score=85, risk_factors=[], liquidity_risk="Low", solvency_risk="Low"),
        health_score=HealthScoreBreakdown(stability=20, profitability=20, growth=20, efficiency=20, total_score=80),
        insights=["Automated Report Generation Successful"]
    )

    # company_name is caller-controlled: keep only file-name-safe characters
    # before it goes anywhere near the filesystem.
    safe_name = "".join(x for x in company_name if x.isalnum() or x in " _-")

    # Per-request scratch directory, removed once the response has been sent.
    work_dir = tempfile.mkdtemp(prefix="pptx_")
    filename = os.path.join(work_dir, f"{safe_name}_presentation.pptx")

    try:
        PPTXReporter.generate(dummy_data, filename)
    except Exception:
        shutil.rmtree(work_dir, ignore_errors=True)
        raise

    return FileResponse(
        filename,
        media_type='application/vnd.openxmlformats-officedocument.presentationml.presentation',
        filename=f"{company_name}_presentation.pptx",
        background=BackgroundTask(shutil.rmtree, work_dir, ignore_errors=True),
    )
|
| 713 |
-
|
| 714 |
-
class EngineUpdate(BaseModel):
    # Request body for the admin engine-switch endpoint; the endpoint itself
    # checks the value against the engines it accepts.
    engine: str
|
| 716 |
-
|
| 717 |
-
@router.put("/admin/users/{user_id}/engine")
def update_user_engine(
    user_id: int,
    update: EngineUpdate,
    admin: User = Depends(get_current_admin),
    db: Session = Depends(get_db)
):
    """Set the preferred analysis engine of a user (admin only).

    Raises:
        HTTPException: 404 when the user does not exist, 400 when the
            requested engine is neither 'v1' nor 'v2'.
    """
    target = db.query(User).filter(User.id == user_id).first()
    if not target:
        raise HTTPException(status_code=404, detail="User not found")

    requested = update.engine
    if requested != "v1" and requested != "v2":
        raise HTTPException(status_code=400, detail="Invalid engine. Use 'v1' or 'v2'.")

    target.preferred_engine = requested
    db.commit()
    return {"status": "success", "engine": target.preferred_engine}
|
| 734 |
-
|
| 735 |
-
@router.get("/public-config")
def get_public_config(db: Session = Depends(get_db)):
    """Return the configuration exposed to Guest/Public users."""
    from app.services.feature_service import get_effective_features

    guest_features = get_effective_features(db, "Guest")
    return {
        "guest_features": guest_features,
        "upload_limit": 2,
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/core/__init__.py
DELETED
|
@@ -1,52 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Core Configuration Package
|
| 3 |
-
|
| 4 |
-
This package contains application-wide configuration and utilities.
|
| 5 |
-
|
| 6 |
-
## Modules
|
| 7 |
-
|
| 8 |
-
- `config.py` - Environment variables and settings
|
| 9 |
-
- `database.py` - SQLAlchemy engine and session
|
| 10 |
-
- `security.py` - JWT token creation/validation
|
| 11 |
-
- `feature_registry.py` - Centralized feature definitions (auto-discoverable)
|
| 12 |
-
- `plan_config.py` - Plan limits and default feature sets
|
| 13 |
-
|
| 14 |
-
## Feature System Architecture
|
| 15 |
-
|
| 16 |
-
The feature system uses a layered approach:
|
| 17 |
-
|
| 18 |
-
1. **Feature Registry** (`feature_registry.py`)
|
| 19 |
-
- Defines ALL controllable features
|
| 20 |
-
- Features auto-appear in admin console
|
| 21 |
-
- Organized by category for easy navigation
|
| 22 |
-
|
| 23 |
-
2. **Plan Config** (`plan_config.py`)
|
| 24 |
-
- Default features per plan tier
|
| 25 |
-
- Upload limits per plan
|
| 26 |
-
- Wildcard "*" for unlimited access
|
| 27 |
-
|
| 28 |
-
3. **Admin Overrides** (via `models/feature_flags.py`)
|
| 29 |
-
- Stored in database
|
| 30 |
-
- Takes precedence over defaults
|
| 31 |
-
- Managed via admin API
|
| 32 |
-
|
| 33 |
-
## Adding New Features
|
| 34 |
-
|
| 35 |
-
```python
|
| 36 |
-
# In feature_registry.py, add to FEATURE_REGISTRY:
|
| 37 |
-
Feature(
|
| 38 |
-
id="new_feature_id",
|
| 39 |
-
name="New Feature Name",
|
| 40 |
-
description="What this feature does",
|
| 41 |
-
category=FeatureCategory.CORE_METRICS # Pick appropriate category
|
| 42 |
-
)
|
| 43 |
-
```
|
| 44 |
-
|
| 45 |
-
The feature will automatically:
|
| 46 |
-
- Appear in admin console UI
|
| 47 |
-
- Be toggleable per plan
|
| 48 |
-
- Respect plan defaults until overridden
|
| 49 |
-
"""
|
| 50 |
-
|
| 51 |
-
from app.core.feature_registry import FEATURE_REGISTRY, Feature, FeatureCategory
|
| 52 |
-
from app.core.plan_config import PLAN_DEFAULTS, get_plan_config, get_default_features
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/core/config.py
DELETED
|
@@ -1,59 +0,0 @@
|
|
| 1 |
-
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 2 |
-
from pydantic import field_validator
|
| 3 |
-
from typing import List, Union, Optional
|
| 4 |
-
|
| 5 |
-
class Settings(BaseSettings):
    """Application settings, read from the environment and the `.env` file."""

    # --- Application -------------------------------------------------------
    PROJECT_NAME: str = "Visique API"
    VERSION: str = "0.1.0"
    API_V1_STR: str = "/api/v1"

    # --- Security ----------------------------------------------------------
    SECRET_KEY: str  # No default: must be provided
    ALGORITHM: str = "HS256"
    ACCESS_TOKEN_EXPIRE_MINUTES: int = 1440  # 24 hours for better UX

    # --- Database ----------------------------------------------------------
    DATABASE_URL: str  # PostgreSQL URL; no default

    # --- CORS --------------------------------------------------------------
    ALLOWED_ORIGINS: Union[List[str], str] = [
        "http://localhost:3000",
        "http://127.0.0.1:3000",
        "https://visique-testing.vercel.app",
        "https://visique-frontend.vercel.app"
    ]

    @field_validator("ALLOWED_ORIGINS", mode="before")
    @classmethod
    def assemble_cors_origins(cls, v: Union[str, List[str]]) -> Union[List[str], str]:
        """Accept a list, a JSON array string, or a comma-separated string."""
        if isinstance(v, list):
            return v
        if isinstance(v, str):
            if v.startswith("["):
                # Looks like a JSON array, e.g. '["https://a", "https://b"]'.
                import json
                return json.loads(v)
            # Otherwise treat it as "https://a, https://b".
            return [i.strip() for i in v.split(",")]
        raise ValueError(v)

    # --- Stripe ------------------------------------------------------------
    STRIPE_SECRET_KEY: Optional[str] = None
    STRIPE_PUBLISHABLE_KEY: Optional[str] = None
    STRIPE_WEBHOOK_SECRET: Optional[str] = None

    # --- Deployment --------------------------------------------------------
    ENVIRONMENT: str = "development"

    # --- Dolphin PDF extraction --------------------------------------------
    DOLPHIN_MODEL_PATH: Optional[str] = None  # Auto-downloads if None
    DOLPHIN_DEVICE: str = "auto"  # "auto" (CUDA > MPS > CPU) | "cuda" | "mps" | "cpu"
    DOLPHIN_MAX_BATCH_SIZE: int = 4
    DOLPHIN_AUTO_DOWNLOAD: bool = True

    # --- Dolphin remote service (optional, for distributed setup) ----------
    DOLPHIN_API_URL: Optional[str] = None
    DOLPHIN_API_KEY: Optional[str] = None

    model_config = SettingsConfigDict(env_file=".env", case_sensitive=True, extra="ignore")
|
| 58 |
-
|
| 59 |
-
settings = Settings()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/core/database.py
DELETED
|
@@ -1,26 +0,0 @@
|
|
| 1 |
-
from sqlalchemy import create_engine
|
| 2 |
-
from sqlalchemy.ext.declarative import declarative_base
|
| 3 |
-
from sqlalchemy.orm import sessionmaker
|
| 4 |
-
|
| 5 |
-
from app.core.config import settings
|
| 6 |
-
|
| 7 |
-
# Connection string comes from the application settings.
SQLALCHEMY_DATABASE_URL = settings.DATABASE_URL

# Fix for Render/SQLAlchemy postgres:// scheme: rewrite only the leading
# scheme to postgresql:// and leave the rest of the URL untouched.
if SQLALCHEMY_DATABASE_URL.startswith("postgres://"):
    SQLALCHEMY_DATABASE_URL = SQLALCHEMY_DATABASE_URL.replace(
        "postgres://", "postgresql://", 1
    )

engine = create_engine(SQLALCHEMY_DATABASE_URL, pool_pre_ping=True)

# Session factory bound to the engine above; autocommit and autoflush are off.
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)

# Declarative base class for the ORM models.
Base = declarative_base()
|
| 20 |
-
|
| 21 |
-
def get_db():
    """Yield a database session and close it once the caller is done.

    Written as a generator so it can be used as a dependency: the session is
    closed in the ``finally`` clause even when the consumer raises.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/core/feature_registry.py
DELETED
|
@@ -1,255 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Feature Registry - Auto-Discoverable Feature System
|
| 3 |
-
|
| 4 |
-
Add new features here and they will automatically appear in the admin console.
|
| 5 |
-
Each feature belongs to a category and can be toggled per plan.
|
| 6 |
-
"""
|
| 7 |
-
|
| 8 |
-
from enum import Enum
|
| 9 |
-
from dataclasses import dataclass, field
|
| 10 |
-
from typing import List, Dict, Optional
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
class FeatureCategory(Enum):
    """Groups under which features are listed in the admin console.

    Each member's value is the human-readable category label.
    """

    CORE_METRICS = "Core Metrics"
    RISK_ANALYSIS = "Risk Analysis"
    FORECASTING = "Forecasting"
    AI_INTELLIGENCE = "AI Intelligence"
    INTERACTIVE = "Interactive Tools"
    EXPORTS = "Exports & Reports"
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
@dataclass
class Feature:
    """
    A single controllable feature of the system.

    Attributes:
        id: Unique identifier used in code and API
        name: Human-readable name for admin console
        category: Grouping category
        description: Brief description of the feature
        default_enabled: Whether enabled by default for new plans
        memory_cost_mb: Estimated RAM usage in MB
    """
    id: str
    name: str
    category: FeatureCategory
    description: str
    default_enabled: bool = True
    memory_cost_mb: int = 5  # Estimated RAM usage in MB
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
# =============================================================================
|
| 45 |
-
# FEATURE REGISTRY - ADD NEW FEATURES HERE
|
| 46 |
-
# =============================================================================
|
| 47 |
-
# When adding new financial model outputs, add a Feature entry below.
|
| 48 |
-
# It will automatically appear in the admin console under the correct category.
|
| 49 |
-
# =============================================================================
|
| 50 |
-
|
| 51 |
-
FEATURE_REGISTRY: List[Feature] = [
|
| 52 |
-
# -------------------------------------------------------------------------
|
| 53 |
-
# Core Metrics
|
| 54 |
-
# -------------------------------------------------------------------------
|
| 55 |
-
Feature(
|
| 56 |
-
id="kpi_margins",
|
| 57 |
-
name="Profit Margins (Gross/Operating/Net)",
|
| 58 |
-
category=FeatureCategory.CORE_METRICS,
|
| 59 |
-
description="Core margin KPIs from income statement",
|
| 60 |
-
memory_cost_mb=2
|
| 61 |
-
),
|
| 62 |
-
Feature(
|
| 63 |
-
id="kpi_ratios",
|
| 64 |
-
name="Financial Ratios",
|
| 65 |
-
category=FeatureCategory.CORE_METRICS,
|
| 66 |
-
description="Current ratio, debt-to-equity, quick ratio",
|
| 67 |
-
memory_cost_mb=2
|
| 68 |
-
),
|
| 69 |
-
Feature(
|
| 70 |
-
id="health_score",
|
| 71 |
-
name="Health Score Dashboard",
|
| 72 |
-
category=FeatureCategory.CORE_METRICS,
|
| 73 |
-
description="Overall financial health scoring (stability, profitability, growth, efficiency)"
|
| 74 |
-
),
|
| 75 |
-
|
| 76 |
-
# -------------------------------------------------------------------------
|
| 77 |
-
# Risk Analysis
|
| 78 |
-
# -------------------------------------------------------------------------
|
| 79 |
-
Feature(
|
| 80 |
-
id="risk_score",
|
| 81 |
-
name="Risk Score",
|
| 82 |
-
category=FeatureCategory.RISK_ANALYSIS,
|
| 83 |
-
description="Aggregate risk scoring (0-100)",
|
| 84 |
-
memory_cost_mb=5
|
| 85 |
-
),
|
| 86 |
-
Feature(
|
| 87 |
-
id="risk_factors",
|
| 88 |
-
name="Risk Factor Breakdown",
|
| 89 |
-
category=FeatureCategory.RISK_ANALYSIS,
|
| 90 |
-
description="Detailed list of identified risk factors"
|
| 91 |
-
),
|
| 92 |
-
Feature(
|
| 93 |
-
id="liquidity_risk",
|
| 94 |
-
name="Liquidity Risk",
|
| 95 |
-
category=FeatureCategory.RISK_ANALYSIS,
|
| 96 |
-
description="Cash flow and working capital risk assessment"
|
| 97 |
-
),
|
| 98 |
-
Feature(
|
| 99 |
-
id="solvency_risk",
|
| 100 |
-
name="Solvency Risk",
|
| 101 |
-
category=FeatureCategory.RISK_ANALYSIS,
|
| 102 |
-
description="Long-term debt sustainability analysis"
|
| 103 |
-
),
|
| 104 |
-
|
| 105 |
-
# -------------------------------------------------------------------------
|
| 106 |
-
# Forecasting
|
| 107 |
-
# -------------------------------------------------------------------------
|
| 108 |
-
Feature(
|
| 109 |
-
id="runway_forecast",
|
| 110 |
-
name="Cash Runway Forecast",
|
| 111 |
-
category=FeatureCategory.FORECASTING,
|
| 112 |
-
description="30/60/90 day cash projections"
|
| 113 |
-
),
|
| 114 |
-
Feature(
|
| 115 |
-
id="burn_rate",
|
| 116 |
-
name="Burn Rate Analysis",
|
| 117 |
-
category=FeatureCategory.FORECASTING,
|
| 118 |
-
description="Monthly cash burn rate calculation"
|
| 119 |
-
),
|
| 120 |
-
Feature(
|
| 121 |
-
id="optimization_insights",
|
| 122 |
-
name="Optimization Insights",
|
| 123 |
-
category=FeatureCategory.FORECASTING,
|
| 124 |
-
description="Dead zones, peak premiums, cost optimization"
|
| 125 |
-
),
|
| 126 |
-
Feature(
|
| 127 |
-
id="budget_variance",
|
| 128 |
-
name="Budget Variance Analysis",
|
| 129 |
-
category=FeatureCategory.FORECASTING,
|
| 130 |
-
description="Target vs actual comparison"
|
| 131 |
-
),
|
| 132 |
-
|
| 133 |
-
# -------------------------------------------------------------------------
|
| 134 |
-
# AI Intelligence
|
| 135 |
-
# -------------------------------------------------------------------------
|
| 136 |
-
Feature(
|
| 137 |
-
id="ai_cfo",
|
| 138 |
-
name="AI CFO Chat",
|
| 139 |
-
category=FeatureCategory.AI_INTELLIGENCE,
|
| 140 |
-
description="Conversational AI financial advisor",
|
| 141 |
-
memory_cost_mb=80
|
| 142 |
-
),
|
| 143 |
-
Feature(
|
| 144 |
-
id="ai_summary",
|
| 145 |
-
name="AI Executive Summary",
|
| 146 |
-
category=FeatureCategory.AI_INTELLIGENCE,
|
| 147 |
-
description="Auto-generated narrative insights",
|
| 148 |
-
memory_cost_mb=60
|
| 149 |
-
),
|
| 150 |
-
Feature(
|
| 151 |
-
id="geo_insights",
|
| 152 |
-
name="Geo-Strategic Insights",
|
| 153 |
-
category=FeatureCategory.AI_INTELLIGENCE,
|
| 154 |
-
description="Location-based market analysis",
|
| 155 |
-
memory_cost_mb=150
|
| 156 |
-
),
|
| 157 |
-
Feature(
|
| 158 |
-
id="intelligence_card",
|
| 159 |
-
name="Strategic Intelligence Card",
|
| 160 |
-
category=FeatureCategory.AI_INTELLIGENCE,
|
| 161 |
-
description="AI-powered strategic recommendations",
|
| 162 |
-
memory_cost_mb=50
|
| 163 |
-
),
|
| 164 |
-
|
| 165 |
-
# -------------------------------------------------------------------------
|
| 166 |
-
# Interactive Tools
|
| 167 |
-
# -------------------------------------------------------------------------
|
| 168 |
-
Feature(
|
| 169 |
-
id="what_if_slider",
|
| 170 |
-
name="What-If Simulator",
|
| 171 |
-
category=FeatureCategory.INTERACTIVE,
|
| 172 |
-
description="Revenue/cost scenario modeling with sliders"
|
| 173 |
-
),
|
| 174 |
-
Feature(
|
| 175 |
-
id="interactive_charts",
|
| 176 |
-
name="Interactive Charts",
|
| 177 |
-
category=FeatureCategory.INTERACTIVE,
|
| 178 |
-
description="Zoomable, hoverable data visualizations"
|
| 179 |
-
),
|
| 180 |
-
Feature(
|
| 181 |
-
id="trend_comparison",
|
| 182 |
-
name="Trend Comparison",
|
| 183 |
-
category=FeatureCategory.INTERACTIVE,
|
| 184 |
-
description="Period-over-period analysis"
|
| 185 |
-
),
|
| 186 |
-
|
| 187 |
-
# -------------------------------------------------------------------------
|
| 188 |
-
# Exports & Reports
|
| 189 |
-
# -------------------------------------------------------------------------
|
| 190 |
-
Feature(
|
| 191 |
-
id="pdf_export",
|
| 192 |
-
name="PDF Report Export",
|
| 193 |
-
category=FeatureCategory.EXPORTS,
|
| 194 |
-
description="Downloadable PDF financial report"
|
| 195 |
-
),
|
| 196 |
-
Feature(
|
| 197 |
-
id="pptx_export",
|
| 198 |
-
name="PowerPoint Export",
|
| 199 |
-
category=FeatureCategory.EXPORTS,
|
| 200 |
-
description="Presentation-ready slides"
|
| 201 |
-
),
|
| 202 |
-
Feature(
|
| 203 |
-
id="csv_export",
|
| 204 |
-
name="Data Export (CSV)",
|
| 205 |
-
category=FeatureCategory.EXPORTS,
|
| 206 |
-
description="Raw data download for further analysis"
|
| 207 |
-
),
|
| 208 |
-
]
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
# =============================================================================
|
| 212 |
-
# HELPER FUNCTIONS
|
| 213 |
-
# =============================================================================
|
| 214 |
-
|
| 215 |
-
def get_all_features() -> List[Feature]:
    """Return every registered feature.

    Note: this is the registry list itself, not a copy.
    """
    return FEATURE_REGISTRY
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
def get_feature_by_id(feature_id: str) -> Optional[Feature]:
    """Return the first registered feature whose id matches, or None."""
    return next(
        (candidate for candidate in FEATURE_REGISTRY if candidate.id == feature_id),
        None,
    )
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
def get_all_feature_ids() -> List[str]:
    """Return the ids of all registered features, in registry order."""
    return [entry.id for entry in FEATURE_REGISTRY]
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
def get_features_by_category() -> Dict[str, List[Feature]]:
    """Group the registered features by category label.

    Keys are the category values in enum declaration order; categories
    without any feature are left out.
    """
    grouped: Dict[str, List[Feature]] = {}
    for category in FeatureCategory:
        members = [entry for entry in FEATURE_REGISTRY if entry.category == category]
        if not members:
            continue
        grouped[category.value] = members
    return grouped
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
def get_default_enabled_features() -> List[str]:
    """Return the ids of the features whose ``default_enabled`` flag is set."""
    return [entry.id for entry in FEATURE_REGISTRY if entry.default_enabled]
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
def validate_feature_ids(feature_ids: List[str]) -> List[str]:
    """
    Check a list of feature ids against the registry.

    Returns the ids that are not registered, in input order (an empty list
    means every id is valid).
    """
    known_ids = set(get_all_feature_ids())
    return [candidate for candidate in feature_ids if candidate not in known_ids]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/core/migrations.py
DELETED
|
@@ -1,111 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Automatic Schema Migration Utility
|
| 3 |
-
|
| 4 |
-
This module runs at startup to ensure database columns match the SQLAlchemy models.
|
| 5 |
-
It adds any missing columns automatically, preventing 'UndefinedColumn' errors
|
| 6 |
-
in production when new fields are added to models.
|
| 7 |
-
"""
|
| 8 |
-
|
| 9 |
-
from sqlalchemy import inspect, text
|
| 10 |
-
from sqlalchemy.engine import Engine
|
| 11 |
-
import logging
|
| 12 |
-
|
| 13 |
-
logger = logging.getLogger(__name__)
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
def get_model_columns(model_class):
    """Collect the mapped columns declared on a model class.

    Every attribute of ``model_class`` that exposes ``.property.columns`` is
    treated as a mapped column; the first entry of that list is stored under
    its own ``name``.

    Args:
        model_class: The class whose attributes are scanned.

    Returns:
        dict mapping column name -> column object.
    """
    columns = {}
    for attr_name in dir(model_class):
        attr = getattr(model_class, attr_name, None)
        prop = getattr(attr, "property", None)
        mapped = getattr(prop, "columns", None)
        # An empty list means there is nothing to record; indexing it would raise.
        if mapped:
            col = mapped[0]
            columns[col.name] = col
    return columns
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
def get_db_columns(engine: Engine, table_name: str):
    """Return the set of column names that exist for ``table_name``.

    An empty set is returned when the table cannot be inspected. A missing
    table is the expected case and stays silent; any other failure is logged
    so it is not mistaken for "table not created yet".
    """
    from sqlalchemy.exc import NoSuchTableError

    inspector = inspect(engine)
    try:
        return {col['name'] for col in inspector.get_columns(table_name)}
    except NoSuchTableError:
        return set()
    except Exception as exc:
        # Best-effort contract is kept (empty set), but the cause is recorded.
        logger.warning("Could not inspect table '%s': %s", table_name, exc)
        return set()
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
def get_column_type_sql(column):
    """Convert a SQLAlchemy column's type to a SQL type string.

    String types with an explicit length become ``VARCHAR(<length>)``.
    Otherwise the exact type class is looked up in a fixed table, and
    anything not listed falls back to ``TEXT``.
    """
    from sqlalchemy import Boolean, Integer, String, DateTime, Text, Float, JSON

    sa_type = column.type

    # Only string-family types have a length that is meaningful for VARCHAR;
    # other types may also expose ``length`` and must not take this path.
    length = getattr(sa_type, 'length', None)
    if isinstance(sa_type, String) and length:
        return f"VARCHAR({length})"

    type_map = {
        Boolean: "BOOLEAN",
        Integer: "INTEGER",
        String: "VARCHAR(255)",
        DateTime: "TIMESTAMP",
        Text: "TEXT",
        Float: "FLOAT",
        JSON: "JSONB",  # PostgreSQL JSON type
    }
    return type_map.get(type(sa_type), "TEXT")
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
def get_default_sql(column):
    """Build the SQL ``DEFAULT`` clause for a column.

    Reads ``column.default.arg`` and renders bool, numeric, string and dict
    values as SQL literals. Any other value (for example a callable), or a
    missing default, yields an empty string.

    Returns:
        The clause text such as ``"DEFAULT 0"``, or ``""`` when none applies.
    """
    import json

    default = column.default
    if default is None:
        return ""

    # Not every default object carries ``arg``; treat that as "no literal".
    value = getattr(default, "arg", None)

    # bool must be tested before int because bool is a subclass of int.
    if isinstance(value, bool):
        return "DEFAULT TRUE" if value else "DEFAULT FALSE"
    if isinstance(value, (int, float)):
        return f"DEFAULT {value}"
    if isinstance(value, str):
        # Double embedded single quotes so the literal stays well-formed.
        escaped = value.replace("'", "''")
        return f"DEFAULT '{escaped}'"
    if isinstance(value, dict):
        try:
            literal = json.dumps(value)
        except (TypeError, ValueError):
            literal = "{}"
        escaped = literal.replace("'", "''")
        return f"DEFAULT '{escaped}'"
    return ""
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
def run_migrations(engine: Engine):
    """
    Check all models and add any missing columns to the database.
    This runs at application startup.

    For each model, the columns declared on the class are compared with the
    columns reported by the database, and one ``ALTER TABLE ... ADD COLUMN``
    is issued per missing column. A failure on one column is logged and does
    not stop the remaining columns.
    """
    from app.models.user import User, Analysis, Payment
    from app.models.feature_flags import PlanFeatureOverride, PlanUploadLimit

    models = [User, Analysis, Payment, PlanFeatureOverride, PlanUploadLimit]

    for model in models:
        table_name = model.__tablename__
        model_cols = get_model_columns(model)
        db_cols = get_db_columns(engine, table_name)

        if not db_cols:
            # Table doesn't exist yet, let create_all handle it
            logger.info(f"Table '{table_name}' not found, will be created by create_all()")
            continue

        # Sorted so the statements run in a reproducible order.
        for col_name in sorted(set(model_cols) - db_cols):
            col = model_cols[col_name]
            parts = (
                f"ALTER TABLE {table_name} ADD COLUMN {col_name}",
                get_column_type_sql(col),
                get_default_sql(col),
            )
            sql = " ".join(part for part in parts if part)

            try:
                # engine.begin() commits on success and rolls back on error,
                # without relying on Connection.commit().
                with engine.begin() as conn:
                    conn.execute(text(sql))
                logger.info(f"✓ Added column '{col_name}' to table '{table_name}'")
            except Exception as e:
                # Column might already exist or other issue
                logger.warning(f"Could not add column '{col_name}' to '{table_name}': {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/core/plan_config.py
DELETED
|
@@ -1,192 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Plan Configuration - Default settings for each subscription plan.
|
| 3 |
-
|
| 4 |
-
This module defines upload limits and default feature access per plan.
|
| 5 |
-
Admins can override these defaults via the admin console.
|
| 6 |
-
"""
|
| 7 |
-
|
| 8 |
-
from typing import Dict, List, Any
|
| 9 |
-
from .feature_registry import get_all_feature_ids
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
# =============================================================================
|
| 13 |
-
# PLAN CONFIGURATION
|
| 14 |
-
# =============================================================================
|
| 15 |
-
# Each plan has:
|
| 16 |
-
# - upload_limit: Monthly upload cap
|
| 17 |
-
# - is_session: True for guest/anonymous (session-based tracking)
|
| 18 |
-
# - features: List of enabled feature IDs, or ["*"] for all features
|
| 19 |
-
# =============================================================================
|
| 20 |
-
|
| 21 |
-
PLAN_DEFAULTS: Dict[str, Dict[str, Any]] = {
|
| 22 |
-
# Guest users on /try page (session-based, no account)
|
| 23 |
-
"Guest": {
|
| 24 |
-
"upload_limit": 2,
|
| 25 |
-
"is_session": True,
|
| 26 |
-
"features": [
|
| 27 |
-
"kpi_margins",
|
| 28 |
-
"health_score",
|
| 29 |
-
"risk_score",
|
| 30 |
-
"pdf_export"
|
| 31 |
-
]
|
| 32 |
-
},
|
| 33 |
-
|
| 34 |
-
# Free trial - full Small Business experience for 1 month
|
| 35 |
-
"Free Trial": {
|
| 36 |
-
"upload_limit": 15,
|
| 37 |
-
"is_session": False,
|
| 38 |
-
"features": [
|
| 39 |
-
"kpi_margins",
|
| 40 |
-
"kpi_ratios",
|
| 41 |
-
"health_score",
|
| 42 |
-
"risk_score",
|
| 43 |
-
"risk_factors",
|
| 44 |
-
"runway_forecast",
|
| 45 |
-
"burn_rate",
|
| 46 |
-
"interactive_charts",
|
| 47 |
-
"pdf_export"
|
| 48 |
-
]
|
| 49 |
-
},
|
| 50 |
-
|
| 51 |
-
# Individual plan - $9/month
|
| 52 |
-
"Individual": {
|
| 53 |
-
"upload_limit": 5,
|
| 54 |
-
"is_session": False,
|
| 55 |
-
"features": [
|
| 56 |
-
"kpi_margins",
|
| 57 |
-
"kpi_ratios",
|
| 58 |
-
"health_score",
|
| 59 |
-
"risk_score",
|
| 60 |
-
"risk_factors",
|
| 61 |
-
"pdf_export"
|
| 62 |
-
]
|
| 63 |
-
},
|
| 64 |
-
|
| 65 |
-
# Organization plan - $49/month
|
| 66 |
-
"Organization": {
|
| 67 |
-
"upload_limit": 10,
|
| 68 |
-
"is_session": False,
|
| 69 |
-
"features": [
|
| 70 |
-
"kpi_margins",
|
| 71 |
-
"kpi_ratios",
|
| 72 |
-
"health_score",
|
| 73 |
-
"risk_score",
|
| 74 |
-
"risk_factors",
|
| 75 |
-
"liquidity_risk",
|
| 76 |
-
"runway_forecast",
|
| 77 |
-
"ai_summary",
|
| 78 |
-
"interactive_charts",
|
| 79 |
-
"pdf_export"
|
| 80 |
-
]
|
| 81 |
-
},
|
| 82 |
-
|
| 83 |
-
# Small Business plan - $99/month
|
| 84 |
-
"Small Business": {
|
| 85 |
-
"upload_limit": 15,
|
| 86 |
-
"is_session": False,
|
| 87 |
-
"features": ["*"] # All features
|
| 88 |
-
},
|
| 89 |
-
|
| 90 |
-
# Mid Business plan - $249/month
|
| 91 |
-
"Mid Business": {
|
| 92 |
-
"upload_limit": 25,
|
| 93 |
-
"is_session": False,
|
| 94 |
-
"features": ["*"] # All features
|
| 95 |
-
},
|
| 96 |
-
|
| 97 |
-
# Large Business / Enterprise - $499+/month
|
| 98 |
-
"Large Business": {
|
| 99 |
-
"upload_limit": 50,
|
| 100 |
-
"is_session": False,
|
| 101 |
-
"features": ["*"] # All features
|
| 102 |
-
},
|
| 103 |
-
|
| 104 |
-
# Admin users - unlimited access
|
| 105 |
-
"Admin": {
|
| 106 |
-
"upload_limit": 999999,
|
| 107 |
-
"is_session": False,
|
| 108 |
-
"features": ["*"]
|
| 109 |
-
},
|
| 110 |
-
|
| 111 |
-
# Engine Configs (Treated as Plans for feature flags)
|
| 112 |
-
"_ENGINE_v1": {
|
| 113 |
-
"upload_limit": 0,
|
| 114 |
-
"is_session": False,
|
| 115 |
-
"features": ["*"]
|
| 116 |
-
},
|
| 117 |
-
"_ENGINE_v2": {
|
| 118 |
-
"upload_limit": 0,
|
| 119 |
-
"is_session": False,
|
| 120 |
-
"features": [
|
| 121 |
-
"kpi_margins", "kpi_ratios", "health_score", "risk_score", "risk_factors",
|
| 122 |
-
"runway_forecast", "burn_rate", "interactive_charts", "pdf_export",
|
| 123 |
-
"ai_summary", "intelligence_card"
|
| 124 |
-
# Note: Geo Insights and AI CFO omitted by default for Lite Engine
|
| 125 |
-
]
|
| 126 |
-
}
|
| 127 |
-
}
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
# Special "Plan" names for Engine Feature Configuration
|
| 132 |
-
ENGINE_PLANS = ["_ENGINE_v1", "_ENGINE_v2"]
|
| 133 |
-
|
| 134 |
-
# Mappings for UI display
|
| 135 |
-
ENGINE_DISPLAY_NAMES = {
|
| 136 |
-
"_ENGINE_v1": "Visi-Insight-1 (Standard)",
|
| 137 |
-
"_ENGINE_v2": "Visi-Insight-2 (Lite)"
|
| 138 |
-
}
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
# =============================================================================
|
| 142 |
-
# HELPER FUNCTIONS
|
| 143 |
-
# =============================================================================
|
| 144 |
-
|
| 145 |
-
def get_plan_config(plan_name: str) -> Dict[str, Any]:
    """
    Look up the settings dict for ``plan_name`` in PLAN_DEFAULTS.

    Unknown plan names receive the "Individual" settings instead.
    """
    fallback = PLAN_DEFAULTS["Individual"]
    return PLAN_DEFAULTS.get(plan_name, fallback)
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
def get_upload_limit(plan_name: str) -> int:
    """Return the plan's "upload_limit" setting, or 5 when the key is absent."""
    return get_plan_config(plan_name).get("upload_limit", 5)
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
def get_default_features(plan_name: str) -> List[str]:
    """
    Get list of enabled feature IDs for a plan.
    Expands ["*"] to all feature IDs.

    The explicit list is returned as a copy so callers can modify the result
    without altering the shared PLAN_DEFAULTS entry.
    """
    features = get_plan_config(plan_name).get("features", [])

    if "*" in features:
        return get_all_feature_ids()

    return list(features)
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
def is_session_based(plan_name: str) -> bool:
    """Return the plan's "is_session" setting (False when the key is absent)."""
    return get_plan_config(plan_name).get("is_session", False)
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
def get_all_plans() -> List[str]:
    """Return every key of PLAN_DEFAULTS as a new list."""
    return [name for name in PLAN_DEFAULTS]
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
def get_billable_plans() -> List[str]:
    """Returns plans that are actual subscription tiers (excludes Guest/Admin).

    The engine pseudo-plans listed in ENGINE_PLANS also live in PLAN_DEFAULTS
    for feature-flag purposes; they are not subscription tiers and are
    excluded as well.
    """
    non_billable = {"Guest", "Admin", *ENGINE_PLANS}
    return [p for p in PLAN_DEFAULTS if p not in non_billable]
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
def get_all_engines() -> List[str]:
    """Returns list of engine identifier keys.

    A copy is returned so callers cannot mutate the module-level ENGINE_PLANS.
    """
    return list(ENGINE_PLANS)
|
| 192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/core/security.py
DELETED
|
@@ -1,28 +0,0 @@
|
|
| 1 |
-
from datetime import datetime, timedelta
|
| 2 |
-
from typing import Optional
|
| 3 |
-
from jose import JWTError, jwt
|
| 4 |
-
from passlib.context import CryptContext
|
| 5 |
-
from app.core.config import settings
|
| 6 |
-
|
| 7 |
-
# Config
|
| 8 |
-
SECRET_KEY = settings.SECRET_KEY
|
| 9 |
-
ALGORITHM = settings.ALGORITHM
|
| 10 |
-
ACCESS_TOKEN_EXPIRE_MINUTES = settings.ACCESS_TOKEN_EXPIRE_MINUTES
|
| 11 |
-
|
| 12 |
-
pwd_context = CryptContext(schemes=["argon2"], deprecated="auto")
|
| 13 |
-
|
| 14 |
-
def verify_password(plain_password, hashed_password):
    """Check ``plain_password`` against ``hashed_password`` using pwd_context."""
    is_match = pwd_context.verify(plain_password, hashed_password)
    return is_match
|
| 16 |
-
|
| 17 |
-
def get_password_hash(password):
    """Hash ``password`` with pwd_context and return the result."""
    hashed = pwd_context.hash(password)
    return hashed
|
| 19 |
-
|
| 20 |
-
def create_access_token(data: dict, expires_delta: Optional[timedelta] = None):
    """Encode ``data`` as a JWT with an ``exp`` claim added.

    Args:
        data: Claims to embed; copied, so the caller's dict is not modified.
        expires_delta: Lifetime of the token. When None, the configured
            ACCESS_TOKEN_EXPIRE_MINUTES is used. An explicit zero-length
            delta is honoured rather than replaced by the default.

    Returns:
        The value returned by ``jwt.encode``.
    """
    from datetime import timezone

    if expires_delta is None:
        expires_delta = timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)

    to_encode = data.copy()
    # Timezone-aware "now" replaces the deprecated naive datetime.utcnow().
    to_encode.update({"exp": datetime.now(timezone.utc) + expires_delta})
    encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
    return encoded_jwt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/core/stripe_config.py
DELETED
|
@@ -1,29 +0,0 @@
|
|
| 1 |
-
import stripe
|
| 2 |
-
from app.core.config import settings
|
| 3 |
-
|
| 4 |
-
stripe.api_key = settings.STRIPE_SECRET_KEY
|
| 5 |
-
|
| 6 |
-
def create_checkout_session(db_user, plan_id: str):
    """Create a Stripe Checkout session in subscription mode for ``db_user``.

    Args:
        db_user: Object providing ``email`` and ``id``.
        plan_id: Passed to Stripe as the line item's ``price``.

    Returns:
        The created session, or None when anything fails (the error is logged).
    """
    import logging

    try:
        # ALLOWED_ORIGINS may be a list or a single string (main.py handles
        # both); indexing a string would yield only its first character.
        origins = settings.ALLOWED_ORIGINS
        if isinstance(origins, (list, tuple)):
            base_url = str(origins[0])
        else:
            base_url = str(origins)
        base_url = base_url.rstrip("/")

        checkout_session = stripe.checkout.Session.create(
            customer_email=db_user.email,
            client_reference_id=str(db_user.id),
            payment_method_types=['card'],
            line_items=[
                {
                    'price': plan_id,
                    'quantity': 1,
                },
            ],
            mode='subscription',
            success_url=f"{base_url}/dashboard?session_id={{CHECKOUT_SESSION_ID}}",
            cancel_url=f"{base_url}/pricing",
            metadata={
                'user_id': db_user.id,
                'plan_name': 'Business'  # Or derive from plan_id
            }
        )
        return checkout_session
    except Exception as e:
        logging.getLogger(__name__).error("Stripe Error: %s", e)
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/main.py
DELETED
|
@@ -1,124 +0,0 @@
|
|
| 1 |
-
from fastapi import FastAPI
|
| 2 |
-
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
-
|
| 4 |
-
app = FastAPI(
|
| 5 |
-
title="Visique API",
|
| 6 |
-
description="Financial Data Analyzer Backend",
|
| 7 |
-
version="0.1.0"
|
| 8 |
-
)
|
| 9 |
-
|
| 10 |
-
from app.core.config import settings
|
| 11 |
-
|
| 12 |
-
# CORS Configuration
|
| 13 |
-
# Ensure Vercel domains are allowed even if env vars override config defaults
|
| 14 |
-
origins = []
|
| 15 |
-
if isinstance(settings.ALLOWED_ORIGINS, list):
|
| 16 |
-
origins.extend(settings.ALLOWED_ORIGINS)
|
| 17 |
-
else:
|
| 18 |
-
origins.append(str(settings.ALLOWED_ORIGINS))
|
| 19 |
-
|
| 20 |
-
extra_origins = [
|
| 21 |
-
"https://visique-testing.vercel.app",
|
| 22 |
-
"https://visique-frontend.vercel.app",
|
| 23 |
-
# Specific current previews
|
| 24 |
-
"https://visique-testing-7qdi0vaqf-sams-projects-85f65c65.vercel.app",
|
| 25 |
-
"https://visique-testing-fky1isli2-sams-projects-85f65c65.vercel.app"
|
| 26 |
-
]
|
| 27 |
-
|
| 28 |
-
for origin in extra_origins:
|
| 29 |
-
if origin not in origins:
|
| 30 |
-
origins.append(origin)
|
| 31 |
-
|
| 32 |
-
app.add_middleware(
|
| 33 |
-
CORSMiddleware,
|
| 34 |
-
allow_origins=origins,
|
| 35 |
-
# Allow any Vercel preview domain for this specific project
|
| 36 |
-
allow_origin_regex=r"https://visique-testing-.*-sams-projects-85f65c65\.vercel\.app",
|
| 37 |
-
allow_credentials=True,
|
| 38 |
-
allow_methods=["*"],
|
| 39 |
-
allow_headers=["*"],
|
| 40 |
-
)
|
| 41 |
-
|
| 42 |
-
@app.get("/")
async def root():
    """Serve the fixed welcome payload at the API root."""
    greeting = {"message": "Welcome to Visique Financial Analyzer API"}
    return greeting
|
| 45 |
-
|
| 46 |
-
@app.get("/health")
async def health_check():
    """Serve a fixed status payload; the keep-alive task pings this route."""
    status = {"status": "healthy"}
    return status
|
| 49 |
-
|
| 50 |
-
from app.api.endpoints import router as analysis_router
|
| 51 |
-
from app.api.auth import router as auth_router
|
| 52 |
-
from app.core.database import engine, Base
|
| 53 |
-
|
| 54 |
-
# Run Automatic Schema Migrations (adds missing columns)
|
| 55 |
-
from app.core.migrations import run_migrations
|
| 56 |
-
run_migrations(engine)
|
| 57 |
-
|
| 58 |
-
# Create Tables (for new tables only, migrations handles columns)
|
| 59 |
-
Base.metadata.create_all(bind=engine)
|
| 60 |
-
|
| 61 |
-
app.include_router(analysis_router, prefix="/api/v1")
|
| 62 |
-
app.include_router(auth_router, prefix="/api/v1")
|
| 63 |
-
|
| 64 |
-
from app.api.admin import router as admin_router
|
| 65 |
-
app.include_router(admin_router, prefix="/api/v1")
|
| 66 |
-
|
| 67 |
-
# Mount Static Files for Uploads
|
| 68 |
-
from fastapi.staticfiles import StaticFiles
|
| 69 |
-
import os
|
| 70 |
-
|
| 71 |
-
# Ensure upload directory exists
|
| 72 |
-
upload_dir = "uploads"
|
| 73 |
-
if not os.path.exists(upload_dir):
|
| 74 |
-
os.makedirs(upload_dir)
|
| 75 |
-
|
| 76 |
-
# Mount /api/v1/static to the uploads directory
|
| 77 |
-
app.mount("/api/v1/static", StaticFiles(directory="uploads"), name="static")
|
| 78 |
-
|
| 79 |
-
from sqlalchemy import text
|
| 80 |
-
from app.core.database import SessionLocal
|
| 81 |
-
|
| 82 |
-
# Startup Migration for V2 Engine Support
|
| 83 |
-
@app.on_event("startup")
def run_migrations():
    """Ensure the ``users.preferred_engine`` column exists at startup.

    Failures are printed as a warning and do not stop startup. The session
    is closed on every path.

    Note: this definition rebinds the module-level name ``run_migrations``
    that was imported from app.core.migrations (and already called) earlier
    in this file.
    """
    db = None
    try:
        db = SessionLocal()
        # Add preferred_engine column if it doesn't exist
        db.execute(text("ALTER TABLE users ADD COLUMN IF NOT EXISTS preferred_engine VARCHAR DEFAULT 'v1'"))
        db.commit()
        print("Startup Migration: Verified preferred_engine column.")
    except Exception as e:
        print(f"Startup Migration Warning: {e}")
    finally:
        # Previously the session leaked whenever execute()/commit() raised.
        if db is not None:
            db.close()
|
| 94 |
-
|
| 95 |
-
# Keep-Alive Background Task to prevent Render free tier from sleeping
|
| 96 |
-
import asyncio
|
| 97 |
-
import httpx
|
| 98 |
-
|
| 99 |
-
async def keep_alive_task():
    """Ping this service's own /health URL in an endless loop.

    Waits 60 seconds first, then issues one GET every 300 seconds. A failed
    ping is printed and the loop continues.
    """
    # Let the application finish starting before the first ping.
    await asyncio.sleep(60)

    # Base URL comes from the environment, with a hard-coded fallback.
    base_url = os.environ.get("RENDER_EXTERNAL_URL", "https://visique-backend.onrender.com")
    health_url = f"{base_url}/health"

    print(f"[Keep-Alive] Started. Pinging {health_url} every 5 minutes.")

    async with httpx.AsyncClient() as http:
        while True:
            try:
                reply = await http.get(health_url, timeout=30)
                print(f"[Keep-Alive] Ping successful: {reply.status_code}")
            except Exception as exc:
                print(f"[Keep-Alive] Ping failed: {exc}")

            # 300 seconds = 5 minutes between pings.
            await asyncio.sleep(300)
|
| 120 |
-
|
| 121 |
-
@app.on_event("startup")
async def start_keep_alive():
    """Starts the keep-alive background task on app startup."""
    # The event loop holds only a weak reference to tasks, so an unreferenced
    # task can be garbage-collected mid-run. Keep it on app.state.
    app.state.keep_alive_task = asyncio.create_task(keep_alive_task())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/models/feature_flags.py
DELETED
|
@@ -1,59 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Feature Flags Model - Admin-managed feature overrides per plan.
|
| 3 |
-
|
| 4 |
-
This model stores per-plan feature overrides that take precedence
|
| 5 |
-
over the defaults defined in plan_config.py.
|
| 6 |
-
"""
|
| 7 |
-
|
| 8 |
-
from sqlalchemy import Column, Integer, String, Boolean, DateTime, ForeignKey
|
| 9 |
-
from sqlalchemy.orm import relationship
|
| 10 |
-
from datetime import datetime
|
| 11 |
-
from app.core.database import Base
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
class PlanFeatureOverride(Base):
    """
    One admin override of a feature's availability for one plan.

    Intended lookup order when deciding whether a feature is enabled:
    1. If a row exists here for (plan_name, feature_id), use its ``enabled``.
    2. Otherwise fall back to the defaults in plan_config.py.
    """
    __tablename__ = "plan_feature_overrides"

    id = Column(Integer, primary_key=True, index=True)
    plan_name = Column(String, index=True, nullable=False)
    feature_id = Column(String, index=True, nullable=False)
    enabled = Column(Boolean, default=True, nullable=False)

    # Audit trail: when the row last changed and which user changed it.
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    updated_by_id = Column(Integer, ForeignKey("users.id"), nullable=True)

    def __repr__(self):
        if self.enabled:
            status = "enabled"
        else:
            status = "disabled"
        return f"<PlanFeatureOverride {self.plan_name}:{self.feature_id}={status}>"
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
class PlanUploadLimit(Base):
    """
    One admin override of the upload limit for one plan.

    Intended lookup order when resolving a plan's upload limit:
    1. If a row exists here for the plan, use its ``upload_limit``.
    2. Otherwise fall back to the defaults in plan_config.py.
    """
    __tablename__ = "plan_upload_limits"

    id = Column(Integer, primary_key=True, index=True)
    # unique=True: at most one override row per plan.
    plan_name = Column(String, unique=True, index=True, nullable=False)
    upload_limit = Column(Integer, nullable=False)

    # Audit trail: when the row last changed and which user changed it.
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    updated_by_id = Column(Integer, ForeignKey("users.id"), nullable=True)

    def __repr__(self):
        return "<PlanUploadLimit {}={}>".format(self.plan_name, self.upload_limit)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/models/user.py
DELETED
|
@@ -1,63 +0,0 @@
|
|
| 1 |
-
import sqlalchemy
|
| 2 |
-
from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, Text, Boolean
|
| 3 |
-
from sqlalchemy.orm import relationship
|
| 4 |
-
from datetime import datetime
|
| 5 |
-
from app.core.database import Base
|
| 6 |
-
|
| 7 |
-
class User(Base):
    """Account row: credentials, plan, profile data and upload counters."""
    __tablename__ = "users"

    # --- Identity and plan -------------------------------------------------
    id = Column(Integer, primary_key=True, index=True)
    email = Column(String, unique=True, index=True)
    hashed_password = Column(String)
    full_name = Column(String, nullable=True)
    company_name = Column(String, nullable=True)
    # NOTE(review): "Free" is not a key of PLAN_DEFAULTS in plan_config.py,
    # so get_plan_config() resolves it to the "Individual" settings — confirm intended.
    plan = Column(String, default="Free")
    plan_expires_at = Column(DateTime, nullable=True)
    is_admin = Column(Boolean, default=False)
    is_super_admin = Column(Boolean, default=False)
    created_at = Column(DateTime, default=datetime.utcnow)

    # --- Verification and profile -----------------------------------------
    visique_id = Column(String, unique=True, index=True, nullable=True)  # Generated VSQ-XXXX
    ein = Column(String, nullable=True)
    address = Column(String, nullable=True)
    profile_picture_url = Column(String, nullable=True)
    industry = Column(String, default="General")
    preferred_engine = Column(String, default="v1")  # "v1" (Standard) or "v2" (Lite)

    # --- Upload tracking ---------------------------------------------------
    monthly_upload_count = Column(Integer, default=0)
    upload_reset_date = Column(DateTime, default=datetime.utcnow)

    # --- Per-user feature overrides (add-ons) ------------------------------
    # Stores { feature_id: bool }.
    # NOTE(review): the default is a single shared dict literal — verify it
    # is never mutated in place.
    custom_features = Column(sqlalchemy.JSON, default={})

    # Child rows are removed together with the user.
    analyses = relationship("Analysis", back_populates="owner", cascade="all, delete-orphan")
    payments = relationship("Payment", back_populates="user", cascade="all, delete-orphan")
|
| 38 |
-
|
| 39 |
-
class Analysis(Base):
    """One stored analysis run belonging to a user."""
    __tablename__ = "analyses"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(Integer, ForeignKey("users.id"))
    timestamp = Column(DateTime, default=datetime.utcnow)
    company_name = Column(String)
    input_filename = Column(String)
    stored_filename = Column(String)  # Path to saved file on disk
    result_json = Column(Text)

    # Other side of User.analyses.
    owner = relationship("User", back_populates="analyses")
|
| 51 |
-
|
| 52 |
-
class Payment(Base):
    """One payment record belonging to a user."""
    __tablename__ = "payments"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(Integer, ForeignKey("users.id"))
    # NOTE(review): the column is Integer, while the original comment left the
    # unit (cents vs dollars) undecided and the pydantic payment schemas
    # declare ``amount`` as float — confirm the unit and type.
    amount = Column(Integer)
    status = Column(String)  # paid, pending, overdue
    date = Column(DateTime, default=datetime.utcnow)
    plan_name = Column(String)
    invoice_pdf = Column(String, nullable=True)  # Path to invoice file

    # Other side of User.payments.
    user = relationship("User", back_populates="payments")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/schemas/chat.py
DELETED
|
@@ -1,14 +0,0 @@
|
|
| 1 |
-
from pydantic import BaseModel
|
| 2 |
-
from typing import List, Optional
|
| 3 |
-
|
| 4 |
-
class Message(BaseModel):
    # A single chat turn.
    role: str  # "user" or "assistant"
    content: str
|
| 7 |
-
|
| 8 |
-
class ChatRequest(BaseModel):
    # Request body: the conversation so far plus an optional context filter.
    messages: List[Message]
    context_filter: Optional[str] = None  # e.g. "Balance Sheet", "Risk Report"
|
| 11 |
-
|
| 12 |
-
class ChatResponse(BaseModel):
    # Reply text plus optional citations.
    response: str
    sources: List[str] = []  # Citations or references to specific data points
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/schemas/financial.py
DELETED
|
@@ -1,47 +0,0 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
from pathlib import Path
|
| 3 |
-
|
| 4 |
-
# Dynamic Path Resolution for 'financial_model' library
|
| 5 |
-
# Structure: root/visique/backend/app/schemas/financial.py -> root/financial_model
|
| 6 |
-
# To import 'financial_model' as a package, we need to add 'root' to sys.path
|
| 7 |
-
# Candidate directories that may contain the 'financial_model' package,
# in priority order:
#   1. parents[4] of this file (schemas -> app -> backend -> visique -> root)
#   2. the current working directory
#   3. two levels above the current working directory
_candidate_roots = []
try:
    _candidate_roots.append(Path(__file__).resolve().parents[4])
except (IndexError, OSError, RuntimeError):
    # File sits fewer than five levels deep (or cannot be resolved);
    # the cwd-based fallbacks below are still tried.
    pass
try:
    _cwd = Path.cwd()
    _candidate_roots.extend([_cwd, _cwd.parent.parent])
except OSError:
    pass  # no usable working directory

for _root in _candidate_roots:
    try:
        _has_package = (_root / "financial_model").exists()
    except OSError:
        _has_package = False
    if _has_package:
        # First match wins; avoid duplicating an existing sys.path entry.
        if str(_root) not in sys.path:
            sys.path.insert(0, str(_root))
        break

try:
    # Now import from the PACKAGE "financial_model"
    from financial_model.models import (
        PeriodType, Currency,
        IncomeStatementStandard, BalanceSheetStandard, CashFlowStandard, OperatingMetrics,
        DocumentClassification,
        FinancialReport, KPIMetrics, BudgetModel, VarianceAnalysis, RiskAnalysis,
        HealthScoreBreakdown, GeoAnalysis, RunwayForecast, OptimizationInsight,
        StandardizedDataPackage
    )
except ImportError:
    print("WARNING: Could not import from financial_model library. Ensure project root is in PYTHONPATH.")
    raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/schemas/user.py
DELETED
|
@@ -1,82 +0,0 @@
|
|
| 1 |
-
from pydantic import BaseModel, EmailStr
|
| 2 |
-
from typing import Optional, List
|
| 3 |
-
from datetime import datetime
|
| 4 |
-
|
| 5 |
-
class UserBase(BaseModel):
    # Shared base for the user schemas: just the e-mail address.
    email: str  # plain str; this field was previously EmailStr
|
| 7 |
-
|
| 8 |
-
class UserCreate(UserBase):
    # Registration payload: e-mail (inherited), password and optional profile fields.
    password: str
    full_name: Optional[str] = None
    company_name: Optional[str] = None
    admin_key: Optional[str] = None
|
| 13 |
-
|
| 14 |
-
class UserLogin(UserBase):
    # Login payload: e-mail (inherited) plus password.
    password: str
|
| 16 |
-
|
| 17 |
-
class UserResponse(UserBase):
    # Outbound user representation; populated from attribute-bearing objects
    # (see Config.from_attributes below).
    id: int
    full_name: Optional[str] = None
    company_name: Optional[str] = None
    plan: str = "Free"
    plan_expires_at: Optional[datetime] = None
    is_admin: bool = False
    is_super_admin: bool = False
    created_at: datetime

    # Verification / profile fields
    visique_id: Optional[str] = None
    ein: Optional[str] = None
    address: Optional[str] = None
    profile_picture_url: Optional[str] = None
    industry: Optional[str] = None
    preferred_engine: Optional[str] = "v1"
    custom_features: Optional[dict] = None  # JSON feature overrides

    class Config:
        from_attributes = True
|
| 38 |
-
|
| 39 |
-
# Token schema: the access token string and its token type.
class Token(BaseModel):
    access_token: str
    token_type: str
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
# Token payload schema; the e-mail is optional and defaults to None.
class TokenData(BaseModel):
    email: Optional[str] = None
|
| 47 |
-
|
| 48 |
-
# Summary fields of a stored analysis: company name, input file name and timestamp.
class AnalysisBase(BaseModel):
    company_name: str
    input_filename: str
    timestamp: datetime
    # result_json is heavy, maybe separate detail view
|
| 53 |
-
|
| 54 |
-
# Response schema for a stored analysis: the AnalysisBase fields plus the
# record id and owning user id.
class AnalysisResponse(AnalysisBase):
    id: int
    user_id: int

    # Pydantic v2 configuration (replaces the deprecated nested `class Config`):
    # lets the schema be validated from attribute-bearing objects such as ORM rows.
    model_config = ConfigDict(from_attributes=True)
|
| 60 |
-
|
| 61 |
-
# Request schema for a plan upgrade: target plan, amount, card details and
# optional checkout fields.
class UpgradeRequest(BaseModel):
    plan_name: str
    amount: float = 0.0
    # NOTE(review): raw card number, expiry and CVV are accepted as plain
    # strings here — confirm they are never logged or persisted by the
    # endpoint that consumes this schema.
    card_number: str
    expiry: str
    cvv: str
    # New Checkout Fields
    address: Optional[str] = None
    ein: Optional[str] = None
|
| 70 |
-
|
| 71 |
-
# Core fields of a payment record: amount, status, plan name and date.
class PaymentBase(BaseModel):
    amount: float
    status: str
    plan_name: str
    date: datetime
|
| 76 |
-
|
| 77 |
-
# Response schema for a payment: the PaymentBase fields plus the record id
# and an optional invoice PDF reference.
class PaymentResponse(PaymentBase):
    id: int
    invoice_pdf: Optional[str] = None

    # Pydantic v2 configuration (replaces the deprecated nested `class Config`):
    # lets the schema be validated from attribute-bearing objects such as ORM rows.
    model_config = ConfigDict(from_attributes=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/__init__.py
DELETED
|
@@ -1,37 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Services Layer
|
| 3 |
-
|
| 4 |
-
This package contains all business logic for the Visique platform.
|
| 5 |
-
|
| 6 |
-
## Module Index
|
| 7 |
-
|
| 8 |
-
- `feature_service` - Feature flag resolution and plan management
|
| 9 |
-
- `analysis/` - Financial analysis and calculations
|
| 10 |
-
- `ingestion/` - Data parsing (CSV, PDF)
|
| 11 |
-
- `intelligence/` - AI-powered features (Gemini, RAG)
|
| 12 |
-
- `reporting/` - Report generation (PDF, PPTX)
|
| 13 |
-
|
| 14 |
-
## Usage Pattern
|
| 15 |
-
|
| 16 |
-
```python
|
| 17 |
-
from app.services.feature_service import get_effective_features, check_upload_limit
|
| 18 |
-
from app.services.analysis.fundamental import FundamentalAnalyzer
|
| 19 |
-
from app.services.intelligence.gemini_service import GeminiService
|
| 20 |
-
```
|
| 21 |
-
|
| 22 |
-
## Design Principles
|
| 23 |
-
|
| 24 |
-
1. **Stateless**: Services don't hold state between calls
|
| 25 |
-
2. **Testable**: All dependencies injected as parameters
|
| 26 |
-
3. **Single Purpose**: Each module handles one domain
|
| 27 |
-
4. **Error Handling**: Raise specific exceptions, don't swallow errors
|
| 28 |
-
"""
|
| 29 |
-
|
| 30 |
-
# Re-export commonly used functions for convenience
|
| 31 |
-
# NOTE: Commented out for AI Worker context to avoid heavy dependencies (SQLAlchemy)
|
| 32 |
-
# from app.services.feature_service import (
|
| 33 |
-
# get_effective_features,
|
| 34 |
-
# check_upload_limit,
|
| 35 |
-
# increment_upload_count,
|
| 36 |
-
# get_effective_upload_limit,
|
| 37 |
-
# )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/analysis/__init__.py
DELETED
|
@@ -1,54 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Financial Analysis Services
|
| 3 |
-
|
| 4 |
-
This package contains the core financial analysis logic.
|
| 5 |
-
|
| 6 |
-
## Module Responsibilities
|
| 7 |
-
|
| 8 |
-
| Module | Purpose | Key Functions |
|
| 9 |
-
|--------|---------|---------------|
|
| 10 |
-
| `fundamental.py` | Main orchestrator | `FundamentalAnalyzer.analyze()` |
|
| 11 |
-
| `kpi.py` | KPI calculations | `KPIAnalyzer.calculate_metrics()` |
|
| 12 |
-
| `risk.py` | Risk assessment | `RiskAnalyzer.analyze()` |
|
| 13 |
-
| `health_score.py` | Overall health | `HealthScoreAnalyzer.calculate()` |
|
| 14 |
-
| `growth.py` | Growth metrics | `GrowthAnalyzer.analyze_growth_potential()` |
|
| 15 |
-
| `simulation.py` | What-if modeling | `SimulationService.run_simulation()` |
|
| 16 |
-
|
| 17 |
-
## Data Flow
|
| 18 |
-
|
| 19 |
-
```
|
| 20 |
-
Raw Data (CSV/PDF)
|
| 21 |
-
↓
|
| 22 |
-
Ingestion Layer (parsed dict)
|
| 23 |
-
↓
|
| 24 |
-
FundamentalAnalyzer.analyze()
|
| 25 |
-
├── KPI Calculator
|
| 26 |
-
├── Risk Analyzer
|
| 27 |
-
├── Health Score
|
| 28 |
-
├── Growth Metrics
|
| 29 |
-
└── (optional) AI Enrichment
|
| 30 |
-
↓
|
| 31 |
-
StandardizedDataPackage
|
| 32 |
-
```
|
| 33 |
-
|
| 34 |
-
## Usage
|
| 35 |
-
|
| 36 |
-
```python
|
| 37 |
-
from app.services.analysis.fundamental import FundamentalAnalyzer
|
| 38 |
-
|
| 39 |
-
analyzer = FundamentalAnalyzer()
|
| 40 |
-
result = analyzer.analyze(report, budget=budget, enabled_features=enabled_features)
|
| 41 |
-
# result is a dict of analysis sections (e.g. "kpis", "risk_analysis", "health_score")
|
| 42 |
-
```
|
| 43 |
-
|
| 44 |
-
## Adding New Analysis Modules
|
| 45 |
-
|
| 46 |
-
1. Create new file in this directory (e.g., `budget.py`)
|
| 47 |
-
2. Define calculation functions with type hints
|
| 48 |
-
3. Import and call from `FundamentalAnalyzer.analyze()`
|
| 49 |
-
4. Add result to `StandardizedDataPackage` schema
|
| 50 |
-
5. (Optional) Register as feature in `feature_registry.py`
|
| 51 |
-
"""
|
| 52 |
-
|
| 53 |
-
# Re-export main analyzer for convenience
|
| 54 |
-
from app.services.analysis.fundamental import FundamentalAnalyzer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/analysis/engine_lite.py
DELETED
|
@@ -1,48 +0,0 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
from pathlib import Path
|
| 3 |
-
from typing import List, Optional, Dict, Any
|
| 4 |
-
from app.schemas.financial import FinancialReport, BudgetModel
|
| 5 |
-
|
| 6 |
-
# Ensure path to financial_model
|
| 7 |
-
try:
|
| 8 |
-
current_file = Path(__file__).resolve()
|
| 9 |
-
project_root = current_file.parents[4]
|
| 10 |
-
if (project_root / "financial_model").exists():
|
| 11 |
-
if str(project_root) not in sys.path:
|
| 12 |
-
sys.path.insert(0, str(project_root))
|
| 13 |
-
except Exception:
|
| 14 |
-
pass
|
| 15 |
-
|
| 16 |
-
try:
|
| 17 |
-
from financial_model.core import FinancialAnalyzer
|
| 18 |
-
except ImportError:
|
| 19 |
-
# Fallback
|
| 20 |
-
sys.path.insert(0, "../../../../../")
|
| 21 |
-
from financial_model.core import FinancialAnalyzer
|
| 22 |
-
|
| 23 |
-
class LiteAnalyzer:
    """
    Visi-Insight-2 (Lite Engine)
    Optimized for memory-constrained environments.
    - No External API calls (GeoService removed)
    - No Heavy Simulation (if added in future)
    - Pure Mathematical Analysis only
    """

    @staticmethod
    def analyze(
        report: FinancialReport,
        budget: Optional[BudgetModel] = None,
        comparisons: Optional[List[FinancialReport]] = None,
        user_address: Optional[str] = None,
        enabled_features: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """Run the library analysis and tag the result as coming from the Lite engine.

        Args:
            report: Financial report handed to ``FinancialAnalyzer``.
            budget: Optional budget forwarded to ``run_full_analysis``.
            comparisons: Optional comparison reports forwarded to ``run_full_analysis``.
            user_address: Optional address forwarded to ``run_full_analysis``.
            enabled_features: Accepted for signature parity with the other
                analyzer; this engine does not read it. Defaults to ``None``
                rather than a shared mutable list.

        Returns:
            The dict produced by ``run_full_analysis`` with a ``meta`` entry
            added and ``geo_analysis`` forced to ``None``.
        """
        # Run Pure Math Analysis
        analyzer = FinancialAnalyzer(report)
        results = analyzer.run_full_analysis(budget, comparisons, user_address)

        # Tag result as Lite
        results['meta'] = {
            "engine": "Visi-Insight-2 (Lite)",
            "optimized": True,
        }

        # Explicitly exclude heavy/external modules like GeoService
        results['geo_analysis'] = None

        return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/analysis/factory.py
DELETED
|
@@ -1,18 +0,0 @@
|
|
| 1 |
-
from app.models.user import User
|
| 2 |
-
from app.services.analysis.fundamental import FundamentalAnalyzer
|
| 3 |
-
from app.services.analysis.engine_lite import LiteAnalyzer
|
| 4 |
-
|
| 5 |
-
class AnalysisFactory:
    """Chooses which analyzer class serves a given user."""

    @staticmethod
    def get_analyzer(user: User):
        """
        Pick the analyzer class from the user's ``preferred_engine`` attribute.

        ``'v2'`` selects ``LiteAnalyzer``; any other value, or a missing
        attribute, selects ``FundamentalAnalyzer``. The class itself is
        returned, not an instance.
        """
        wants_lite_engine = getattr(user, 'preferred_engine', 'v1') == 'v2'
        return LiteAnalyzer if wants_lite_engine else FundamentalAnalyzer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/analysis/fundamental.py
DELETED
|
@@ -1,75 +0,0 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
from pathlib import Path
|
| 3 |
-
from typing import List, Optional, Dict, Any
|
| 4 |
-
|
| 5 |
-
# Ensure project root is in path so we can import 'financial_model' package
|
| 6 |
-
try:
|
| 7 |
-
current_file = Path(__file__).resolve()
|
| 8 |
-
project_root = current_file.parents[4]
|
| 9 |
-
if (project_root / "financial_model").exists():
|
| 10 |
-
if str(project_root) not in sys.path:
|
| 11 |
-
sys.path.insert(0, str(project_root))
|
| 12 |
-
except Exception:
|
| 13 |
-
pass
|
| 14 |
-
|
| 15 |
-
from app.schemas.financial import (
|
| 16 |
-
FinancialReport,
|
| 17 |
-
BudgetModel,
|
| 18 |
-
StandardizedDataPackage
|
| 19 |
-
)
|
| 20 |
-
# Import Core Logic from Library Package
|
| 21 |
-
try:
|
| 22 |
-
from financial_model.core import FinancialAnalyzer
|
| 23 |
-
except ImportError:
|
| 24 |
-
# If path setup failed, try forcing the path
|
| 25 |
-
sys.path.insert(0, "../../../../../")
|
| 26 |
-
from financial_model.core import FinancialAnalyzer
|
| 27 |
-
|
| 28 |
-
class FundamentalAnalyzer:
    """Standard analysis engine: library analysis plus optional geo enrichment."""

    @staticmethod
    def analyze(
        report: FinancialReport,
        budget: Optional[BudgetModel] = None,
        comparisons: Optional[List[FinancialReport]] = None,
        user_address: Optional[str] = None,
        enabled_features: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """
        Main entry point for analysis.
        Delegates core logic to the independent 'financial_model' library.
        Enhances result with external services (GeoService).

        Args:
            report: Financial report to analyze.
            budget: Optional budget forwarded to the library.
            comparisons: Optional comparison reports forwarded to the library.
            user_address: Optional address of the requesting user; also used
                as the geo-analysis address when the report carries none.
            enabled_features: Feature IDs enabled for the caller. Geo analysis
                runs only when ``"geo_insights"`` is present. ``None`` (the
                default) is treated as "no features", avoiding a shared
                mutable default list.

        Returns:
            The dict produced by ``run_full_analysis``, with ``geo_analysis``
            set when the geo service returned a truthy result.
        """
        features = enabled_features or ()

        # 1. Run Pure Financial Analysis (Library)
        analyzer = FinancialAnalyzer(report)
        results = analyzer.run_full_analysis(budget, comparisons, user_address)

        # 2. Inject External Services (Geo Intelligence)
        # This keeps the library pure and the backend handling integration
        geo_analysis = None
        is_own_company = False

        company_address = getattr(report, 'company_address', None)
        if company_address:
            analysis_address = company_address
            # Same address (ignoring case and surrounding whitespace) means the
            # user is analysing their own company.
            if user_address and user_address.lower().strip() == company_address.lower().strip():
                is_own_company = True
        elif user_address:
            analysis_address = user_address
            is_own_company = True
        else:
            # No address anywhere: fall back to a label derived from the company name.
            analysis_address = f"{report.company_name} Location"

        if "geo_insights" in features and analysis_address:
            try:
                # Imported lazily so the analyzer still works when the geo
                # service module is unavailable.
                from app.services.intelligence.geo_service import GeoService
                geo_analysis = GeoService.analyze_location(
                    analysis_address,
                    report.metrics.industry,
                    is_own_company=is_own_company,
                    company_name=report.company_name
                )
            except ImportError:
                print("Warning: GeoService not available.")
            except Exception as e:
                # Geo enrichment is best-effort; the financial results are
                # still returned when it fails.
                print(f"Error in GeoService: {e}")

        if geo_analysis:
            results['geo_analysis'] = geo_analysis

        return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/analysis/growth.py
DELETED
|
@@ -1,26 +0,0 @@
|
|
| 1 |
-
from app.schemas.financial import FinancialReport
|
| 2 |
-
|
| 3 |
-
class GrowthAnalyzer:
    """Heuristic growth-signal detector for a single-period report."""

    @staticmethod
    def analyze_growth_potential(report: FinancialReport) -> str:
        """
        A modular analyzer that looks for growth signals.

        Reads ``revenue`` and ``operating_income`` from
        ``report.income_statement``. Missing (``None``) values are treated as
        zero, so an incomplete statement yields the neutral message instead of
        raising ``TypeError``.

        Returns:
            A sentence starting with ``"Growth Potential: "`` followed by the
            detected signals, or a neutral message when none apply.
        """
        # In a real model, this would compare current vs previous periods.
        # Since we only have one period in the standard import, we use heuristics or "Time Series" placeholder logic.
        income = report.income_statement
        revenue = income.revenue or 0
        operating_income = income.operating_income or 0

        signals = []

        if revenue > 1_000_000:
            signals.append("High Volume Business: Revenue > $1M suggests established market presence.")

        # Both values must be non-zero: avoids division by zero and skips
        # statements with no operating income.
        if operating_income and revenue and (operating_income / revenue) > 0.20:
            signals.append("Scalable Model: Operating margins > 20% indicate high growth potential.")

        if not signals:
            return "Growth Potential: Stable / Needs more historical data."

        return "Growth Potential: " + " ".join(signals)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/analysis/health_score.py
DELETED
|
@@ -1,46 +0,0 @@
|
|
| 1 |
-
from app.schemas.financial import KPIMetrics, HealthScoreBreakdown
|
| 2 |
-
|
| 3 |
-
class HealthScoreAnalyzer:
    """Turns KPI metrics into a four-part health score capped at 100."""

    @staticmethod
    def calculate(metrics: KPIMetrics) -> HealthScoreBreakdown:
        """Score stability, profitability, growth and efficiency from ``metrics``.

        Metrics that are falsy (``None`` or zero) earn no points, except a
        falsy debt-to-equity, which is treated as acceptable and earns 10.
        Growth is a fixed baseline of 10; efficiency starts at 10 and gains
        10 when DSO is present and below 45.
        """
        # 1. Stability (Liquidity/Debt) - Max 25
        stability = 0
        current_ratio = metrics.current_ratio
        if current_ratio:
            if current_ratio > 1.5:
                stability += 15
            elif current_ratio > 1.0:
                stability += 10

        leverage = metrics.debt_to_equity
        if not leverage:
            # Assume acceptable if no debt info
            stability += 10
        elif leverage < 1.0:
            stability += 10
        elif leverage < 2.0:
            stability += 5

        # 2. Profitability (Margins) - Max 35
        profitability = 0
        net_margin = metrics.net_margin
        if net_margin:
            if net_margin > 15:
                profitability += 15
            elif net_margin > 5:
                profitability += 10
            elif net_margin > 0:
                profitability += 5

        gross_margin = metrics.gross_margin
        if gross_margin:
            if gross_margin > 40:
                profitability += 10
            elif gross_margin > 20:
                profitability += 5

        return_on_equity = metrics.roe
        if return_on_equity and return_on_equity > 15:
            profitability += 10

        # 3. Growth (Placeholder / Revenue Trajectory) - Max 20
        # In single snapshot, we check generic health markers
        growth = 10  # Baseline

        # 4. Efficiency - Max 20
        efficiency = 10  # Baseline
        if metrics.dso and metrics.dso < 45:
            efficiency += 10

        return HealthScoreBreakdown(
            stability=stability,
            profitability=profitability,
            growth=growth,
            efficiency=efficiency,
            total_score=min(100, stability + profitability + growth + efficiency),
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/analysis/kpi.py
DELETED
|
@@ -1,56 +0,0 @@
|
|
| 1 |
-
from app.schemas.financial import IncomeStatementStandard, BalanceSheetStandard, CashFlowStandard, KPIMetrics
|
| 2 |
-
|
| 3 |
-
class KPIAnalyzer:
    """Derives KPI metrics from a report's income statement and balance sheet."""

    @staticmethod
    def calculate_metrics(report: 'FinancialReport') -> KPIMetrics:
        """Compute profitability, liquidity, solvency and efficiency KPIs.

        Zero or missing denominators (revenue, current liabilities, equity)
        are replaced by ``1.0`` so no division by zero occurs. Values
        pre-extracted into ``report.metadata`` are used for the restaurant
        margin and effective tax rate when they parse as floats; unparsable
        values are ignored.

        Returns:
            A ``KPIMetrics`` instance with the computed fields assigned.
        """
        income = report.income_statement
        balance = report.balance_sheet

        metrics = KPIMetrics()

        # Profitability
        rev = income.revenue or 1.0  # Avoid div by zero

        metrics.gross_margin = (income.gross_profit / rev) * 100
        metrics.operating_margin = (income.operating_income / rev) * 100
        metrics.net_margin = (income.net_income / rev) * 100

        # Liquidity
        curr_liab = balance.total_current_liabilities or 1.0
        metrics.current_ratio = balance.total_current_assets / curr_liab

        # Solvency
        equity = balance.total_equity or 1.0
        if balance.total_liabilities:
            metrics.debt_to_equity = balance.total_liabilities / equity

        metrics.roe = (income.net_income / equity) * 100

        # Efficiency
        daily_sales = rev / 365
        if daily_sales > 0 and balance.accounts_receivable:
            metrics.dso = balance.accounts_receivable / daily_sales

        # Restaurant / Service Specific
        # Prime Cost = (COGS + Payroll) / Revenue
        metrics.prime_cost = ((income.cogs + income.payroll_expenses) / rev) * 100

        # Extracted or Calculated Extra Metrics
        # 1. Restaurant Margin
        if "extracted_restaurant_margin" in report.metadata:
            try:
                metrics.restaurant_margin = float(report.metadata["extracted_restaurant_margin"])
            except (TypeError, ValueError):
                # Unparsable extracted value: leave the metric unset.
                pass

        # 2. Effective Tax Rate
        if "extracted_effective_tax_rate" in report.metadata:
            try:
                metrics.effective_tax_rate = float(report.metadata["extracted_effective_tax_rate"])
            except (TypeError, ValueError):
                # Unparsable extracted value: leave the metric unset.
                pass
        elif income.taxes > 0 and income.net_income > 0:
            # Only reached when no extracted value exists: taxes over pre-tax income.
            pre_tax = income.net_income + income.taxes
            metrics.effective_tax_rate = (income.taxes / pre_tax) * 100

        return metrics
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/analysis/risk.py
DELETED
|
@@ -1,57 +0,0 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
-
from app.schemas.financial import KPIMetrics, RiskAnalysis
|
| 3 |
-
|
| 4 |
-
class RiskAnalyzer:
    """Scores financial risk from KPI metrics and cash runway."""

    @staticmethod
    def analyze(metrics: KPIMetrics, balance_cash: float = 0.0, monthly_burn: float = 0.0) -> RiskAnalysis:
        """Start from a score of 100 and deduct points for each risk found.

        Args:
            metrics: KPI metrics; ``current_ratio``, ``debt_to_equity`` and
                ``net_margin`` are read.
            balance_cash: Cash balance used for the runway calculation.
            monthly_burn: Monthly burn; runway is only computed when positive.

        Returns:
            A ``RiskAnalysis`` with the score (floored at 0.0), the list of
            findings, liquidity and solvency labels, and the runway in months
            (``None`` when no burn was given).
        """
        deductions = []
        findings = []

        def flag(points, message):
            # Record one scored finding.
            deductions.append(points)
            findings.append(message)

        # 1. Liquidity Risk (Current Ratio)
        liquidity_label = "Low Risk"  # Default assumes good
        ratio = metrics.current_ratio
        if not ratio:
            findings.append("Unknown: Missing Current Ratio data")
        elif ratio < 1.0:
            flag(20, "Critical: Current Ratio < 1.0 (Liquidity Issue)")
            liquidity_label = "Critical"
        elif ratio < 1.5:
            flag(10, "Warning: Current Ratio < 1.5")
            liquidity_label = "Medium"

        # 2. Solvency Risk (Debt to Equity)
        solvency_label = "Low Risk"
        leverage = metrics.debt_to_equity
        if leverage:
            if leverage > 2.0:
                flag(15, "High Leverage: Debt/Equity > 2.0")
                solvency_label = "High Risk"
            elif leverage > 1.0:
                solvency_label = "Medium Risk"

        # 3. Profitability Risk
        if metrics.net_margin and metrics.net_margin < 0:
            flag(25, "Loss Making: Negative Net Margin")

        # 4. Burn Rate (Runway)
        runway = balance_cash / monthly_burn if monthly_burn > 0 else None
        if runway is not None:
            if runway < 3:
                flag(25, f"CRITICAL: Low Cash Runway ({runway:.1f} months)")
                liquidity_label = "Critical"
            elif runway < 6:
                flag(10, f"Warning: Cash Runway < 6 months ({runway:.1f} months)")

        return RiskAnalysis(
            risk_score=max(0.0, 100.0 - sum(deductions)),
            risk_factors=findings,
            liquidity_risk=liquidity_label,
            solvency_risk=solvency_label,
            burn_rate_months=runway,
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/analysis/simulation.py
DELETED
|
@@ -1,67 +0,0 @@
|
|
| 1 |
-
from app.schemas.financial import FinancialReport, StandardizedDataPackage, KPIMetrics, RiskAnalysis, IncomeStatementStandard
|
| 2 |
-
from app.services.analysis.kpi import KPIAnalyzer
|
| 3 |
-
from app.services.analysis.risk import RiskAnalyzer
|
| 4 |
-
from app.services.analysis.health_score import HealthScoreAnalyzer
|
| 5 |
-
from app.services.analysis.fundamental import FundamentalAnalyzer
|
| 6 |
-
import copy
|
| 7 |
-
|
| 8 |
-
class SimulationService:
    """Runs what-if scenarios by scaling income-statement lines and re-analysing."""

    @staticmethod
    def run_simulation(
        original_data: FinancialReport,
        delta_revenue_percent: float = 0.0,
        delta_cogs_percent: float = 0.0,
        delta_payroll_percent: float = 0.0,
        delta_marketing_percent: float = 0.0,
        delta_fixed_costs_percent: float = 0.0
    ) -> StandardizedDataPackage:
        """
        Runs a What-If scenario on the financial data.
        Delta percentages are passed as floats (e.g., 10.0 for +10%).

        The input report is deep-copied, so ``original_data`` is never
        mutated. The fixed-cost delta is applied to both ``rent_expense`` and
        ``other_operating_expenses``.

        Returns:
            A ``StandardizedDataPackage`` built from the simulated report and
            the sections returned by ``FundamentalAnalyzer.analyze``, with a
            one-line summary of the applied deltas as the first insight.
        """
        # Deep copy to avoid mutating original
        simulated_report = copy.deepcopy(original_data)
        income = simulated_report.income_statement

        # (income-statement attribute, percent delta) pairs to apply.
        adjustments = (
            ("revenue", delta_revenue_percent),
            ("cogs", delta_cogs_percent),
            ("payroll_expenses", delta_payroll_percent),
            ("marketing_expenses", delta_marketing_percent),
            ("rent_expense", delta_fixed_costs_percent),
            ("other_operating_expenses", delta_fixed_costs_percent),
        )
        for field_name, percent in adjustments:
            # Zero deltas are skipped so untouched fields are never read.
            if percent != 0:
                setattr(income, field_name, getattr(income, field_name) * (1 + percent / 100))

        # NOTE(review): derived fields on the report (e.g. gross profit) are not
        # recomputed here; this relies on the analysis below deriving what it
        # needs from the adjusted lines — confirm against the library.
        # Note: In a real complex model, variable costs might scale with revenue automatically.
        # Here we assume structure stays static unless explicitly modified.

        # Re-run Full Analysis (Phase 3 Update)
        # Instead of calling individual analyzers, call the main FundamentalAnalyzer
        # This ensures simulated data gets Runway, Optimization, etc.
        full_analysis = FundamentalAnalyzer.analyze(simulated_report)

        # Override insights to show what changed. The payroll delta is
        # included because it is applied above alongside the others.
        sim_summary = (
            f"Simulation: Rev {delta_revenue_percent:+.0f}%, "
            f"COGS {delta_cogs_percent:+.0f}%, "
            f"Payroll {delta_payroll_percent:+.0f}%, "
            f"Mkt {delta_marketing_percent:+.0f}%, "
            f"Fixed {delta_fixed_costs_percent:+.0f}%"
        )
        full_analysis['insights'].insert(0, sim_summary)

        return StandardizedDataPackage(
            raw_data=simulated_report,
            kpis=full_analysis['kpis'],
            risk_analysis=full_analysis['risk_analysis'],
            health_score=full_analysis['health_score'],
            insights=full_analysis['insights'],
            recommendations=full_analysis['recommendations'],
            runway_forecast=full_analysis['runway_forecast'],
            optimization_insights=full_analysis['optimization_insights']
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/feature_service.py
DELETED
|
@@ -1,306 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Feature Service - Business logic for feature flag management.
|
| 3 |
-
|
| 4 |
-
Handles the resolution of feature availability considering:
|
| 5 |
-
1. Admin overrides (from database)
|
| 6 |
-
2. Plan defaults (from plan_config.py)
|
| 7 |
-
3. Feature registry validation
|
| 8 |
-
"""
|
| 9 |
-
|
| 10 |
-
from typing import List, Dict, Optional, Any
|
| 11 |
-
from sqlalchemy.orm import Session
|
| 12 |
-
from datetime import datetime, timedelta
|
| 13 |
-
|
| 14 |
-
from app.core.feature_registry import (
|
| 15 |
-
get_all_features,
|
| 16 |
-
get_feature_by_id,
|
| 17 |
-
get_all_feature_ids,
|
| 18 |
-
get_features_by_category,
|
| 19 |
-
Feature
|
| 20 |
-
)
|
| 21 |
-
from app.core.plan_config import (
|
| 22 |
-
get_default_features,
|
| 23 |
-
get_upload_limit as get_default_upload_limit,
|
| 24 |
-
get_all_plans,
|
| 25 |
-
get_all_engines,
|
| 26 |
-
PLAN_DEFAULTS
|
| 27 |
-
)
|
| 28 |
-
from app.models.feature_flags import PlanFeatureOverride, PlanUploadLimit
|
| 29 |
-
from app.models.user import User
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
def get_effective_features(db: Session, plan_name: str) -> List[str]:
    """
    Return the feature IDs enabled for ``plan_name``.

    The plan defaults from plan_config.py form the starting set; every
    ``PlanFeatureOverride`` row stored for the plan then adds its feature
    (enabled) or removes it (disabled). The order of the returned list is
    unspecified.
    """
    enabled_ids = set(get_default_features(plan_name))

    override_rows = (
        db.query(PlanFeatureOverride)
        .filter(PlanFeatureOverride.plan_name == plan_name)
        .all()
    )

    for row in override_rows:
        # An enabled override adds the feature; a disabled one removes it.
        apply_override = enabled_ids.add if row.enabled else enabled_ids.discard
        apply_override(row.feature_id)

    return list(enabled_ids)
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
def is_feature_enabled(db: Session, plan_name: str, feature_id: str) -> bool:
    """Return True when ``feature_id`` is among the plan's effective features."""
    return feature_id in get_effective_features(db, plan_name)
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
def resolve_user_features(db: Session, user: User) -> List[str]:
    """
    Resolve final feature flags for a user, combining:
    1. Plan Entitlements (Base)
    2. User-Specific Overrides (Add-ons/Removals) -> stored in user.custom_features
    3. Engine Constraints (Hard Limit)

    Admin users are resolved against the "Admin" plan regardless of
    ``user.plan``; users without a plan fall back to "Free".

    Returns: List of enabled feature IDs.
    """
    # 1. Base Plan Features
    current_plan = "Admin" if user.is_admin else (user.plan or "Free")
    plan_features = set(get_effective_features(db, current_plan))

    # 2. Apply User Custom Overrides (Add-ons / Removals)
    # user.custom_features is a JSON dict { "feature_id": bool }
    # Ensure it's a dict (SQLAlchemy JSON might return None if default not applied yet)
    custom_map = user.custom_features or {}
    if isinstance(custom_map, str):
        # Handle case with SQLite where it might be stored as string
        import json
        try:
            custom_map = json.loads(custom_map)
        except ValueError:
            # json.JSONDecodeError subclasses ValueError: malformed JSON
            # means "no overrides".
            custom_map = {}
    if not isinstance(custom_map, dict):
        # Valid JSON that is not an object (e.g. "null" or a list) carries
        # no usable overrides.
        custom_map = {}

    for fid, enabled in custom_map.items():
        if enabled:
            plan_features.add(fid)
        else:
            plan_features.discard(fid)

    # 3. Apply Engine Constraints (Hardware Limits)
    # Default to v1 if not set
    engine_pref = getattr(user, "preferred_engine", "v1") or "v1"
    engine_key = f"_ENGINE_{engine_pref}"

    # Get engine allowed features
    engine_features = set(get_effective_features(db, engine_key))

    # Final Result = (Plan U Custom) INTERSECT Engine
    return list(plan_features.intersection(engine_features))
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
def get_effective_upload_limit(db: Session, plan_name: str) -> int:
    """
    Return the upload limit for ``plan_name``.

    A stored ``PlanUploadLimit`` row for the plan wins; without one, the
    default from plan_config.py is returned.
    """
    stored_limit = (
        db.query(PlanUploadLimit)
        .filter(PlanUploadLimit.plan_name == plan_name)
        .first()
    )
    return stored_limit.upload_limit if stored_limit else get_default_upload_limit(plan_name)
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
def get_all_plan_features(db: Session) -> Dict[str, Dict[str, Any]]:
    """
    Get feature configuration for all plans.
    Returns a dict with plan names as keys and feature configs as values.

    Each value has the shape
    ``{"upload_limit": int, "features": {feature_id: bool}}`` and covers
    every feature ID known to the registry.
    """
    all_feature_ids = get_all_feature_ids()
    result = {}

    for plan_name in get_all_plans():
        # A set keeps the per-feature membership test O(1).
        enabled_features = set(get_effective_features(db, plan_name))

        result[plan_name] = {
            "upload_limit": get_effective_upload_limit(db, plan_name),
            "features": {fid: fid in enabled_features for fid in all_feature_ids},
        }

    return result
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
def get_feature_matrix(db: Session) -> Dict[str, Any]:
    """
    Get feature matrix for admin console display.
    Includes categories, features, and per-plan enablement.

    Returns:
        A dict with ``categories`` (list of category names), ``plans``,
        ``engines`` and ``matrix``, which maps each category to a list of
        feature rows holding per-plan and per-engine booleans.
    """
    categories = get_features_by_category()
    plans = get_all_plans()
    engines = get_all_engines()

    # Resolve each plan and engine once, rather than issuing one query per
    # (feature, plan/engine) cell.
    enabled_by_plan = {plan: set(get_effective_features(db, plan)) for plan in plans}
    enabled_by_engine = {engine: set(get_effective_features(db, engine)) for engine in engines}

    # Build matrix
    matrix = {}
    for cat_name, features in categories.items():
        matrix[cat_name] = [
            {
                "id": feature.id,
                "name": feature.name,
                "description": feature.description,
                # Not every registry entry may declare a memory cost; default to 0.
                "memory_cost_mb": getattr(feature, "memory_cost_mb", 0),
                "plans": {plan: feature.id in enabled_by_plan[plan] for plan in plans},
                "engines": {engine: feature.id in enabled_by_engine[engine] for engine in engines},
            }
            for feature in features
        ]

    return {
        "categories": list(categories.keys()),
        "plans": plans,
        "engines": engines,
        "matrix": matrix
    }
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
def set_feature_override(
    db: Session,
    plan_name: str,
    feature_id: str,
    enabled: bool,
    admin_id: Optional[int] = None
) -> PlanFeatureOverride:
    """
    Create or update the override row for one (plan, feature) pair.

    Args:
        db: Database session used for the lookup and the commit.
        plan_name: Plan the override applies to.
        feature_id: Feature being switched; must be a known feature ID.
        enabled: Desired state of the feature for this plan.
        admin_id: Stored on the row as ``updated_by_id``.

    Returns:
        The persisted override, refreshed from the database.

    Raises:
        ValueError: If ``feature_id`` is not a known feature.
    """
    if not get_feature_by_id(feature_id):
        raise ValueError(f"Unknown feature ID: {feature_id}")

    record = (
        db.query(PlanFeatureOverride)
        .filter(
            PlanFeatureOverride.plan_name == plan_name,
            PlanFeatureOverride.feature_id == feature_id,
        )
        .first()
    )

    if not record:
        # No override yet for this pair: insert a new row.
        record = PlanFeatureOverride(
            plan_name=plan_name,
            feature_id=feature_id,
            enabled=enabled,
            updated_by_id=admin_id,
        )
        db.add(record)
    else:
        record.enabled = enabled
        record.updated_by_id = admin_id

    db.commit()
    db.refresh(record)
    return record
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
def bulk_set_features(
    db: Session,
    plan_name: str,
    feature_states: Dict[str, bool],
    admin_id: Optional[int] = None
) -> int:
    """
    Bulk update feature states for a plan.

    All feature IDs are validated before anything is written, so a bad ID
    can no longer leave the plan half-updated (each individual override is
    committed by set_feature_override as it is applied).

    Args:
        db: Database session.
        plan_name: Plan whose overrides are being set.
        feature_states: Mapping of feature ID to desired enabled state.
        admin_id: Recorded on each override as the updating admin.

    Returns:
        Count of updated features.

    Raises:
        ValueError: If any feature ID is unknown; nothing is written then.
    """
    # Fail fast with the same message set_feature_override would raise for
    # the first unknown ID in iteration order.
    for feature_id in feature_states:
        if not get_feature_by_id(feature_id):
            raise ValueError(f"Unknown feature ID: {feature_id}")

    for feature_id, enabled in feature_states.items():
        set_feature_override(db, plan_name, feature_id, enabled, admin_id)

    return len(feature_states)
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
def reset_plan_to_defaults(db: Session, plan_name: str) -> int:
    """
    Drop every override stored for ``plan_name`` so the plan falls back
    to its default feature configuration.

    Returns the number of override rows that were deleted.
    """
    plan_overrides = db.query(PlanFeatureOverride).filter(
        PlanFeatureOverride.plan_name == plan_name
    )
    deleted_count = plan_overrides.delete()
    db.commit()
    return deleted_count
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
def check_upload_limit(db: Session, user: User) -> Dict[str, Any]:
    """
    Check if user can upload, considering their plan limit.

    Also handles the rolling 30-day reset: when the stored reset date is
    missing it is initialised to now, and when it is 30 or more days old the
    upload counter is zeroed and the reset date is moved to now. Either
    change is committed immediately.

    Returns:
        {
            "can_upload": bool,
            "uploads_used": int,
            "uploads_limit": int,
            "uploads_remaining": int,
            "reset_date": str  # ISO-8601 timestamp of the next reset
        }
    """
    # utcnow() yields a naive datetime; it is subtracted from the stored
    # reset date below.
    # NOTE(review): presumably the column is naive UTC as well — confirm.
    now = datetime.utcnow()
    last_reset = user.upload_reset_date

    if not last_reset:
        # First check for this user: start the 30-day window now.
        user.upload_reset_date = now
        db.commit()
    elif (now - last_reset).days >= 30:
        # Window elapsed: clear the counter and start a new window.
        user.monthly_upload_count = 0
        user.upload_reset_date = now
        db.commit()

    # Admins are always evaluated against the "Admin" plan; users with no
    # plan fall back to "Individual".
    plan = "Admin" if user.is_admin else (user.plan or "Individual")

    limit = get_effective_upload_limit(db, plan)
    used = user.monthly_upload_count or 0
    remaining = max(0, limit - used)

    # upload_reset_date is always populated by this point, so no fallback
    # is needed when computing the next reset.
    next_reset = user.upload_reset_date + timedelta(days=30)

    return {
        "can_upload": used < limit,
        "uploads_used": used,
        "uploads_limit": limit,
        "uploads_remaining": remaining,
        "reset_date": next_reset.isoformat()
    }
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
def increment_upload_count(db: Session, user: User) -> int:
    """
    Record one more upload for ``user`` and commit the change.

    Intended to be called after an upload has succeeded. A missing
    (None) counter is treated as zero.

    Returns the counter value after the increment.
    """
    uploads_so_far = user.monthly_upload_count or 0
    user.monthly_upload_count = uploads_so_far + 1
    db.commit()
    return user.monthly_upload_count
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/ingestion/__init__.py
DELETED
|
@@ -1,57 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Ingestion Layer - File parsing and data extraction.
|
| 3 |
-
|
| 4 |
-
This package handles parsing of various financial document formats
|
| 5 |
-
and standardizing them into a common FinancialReport schema.
|
| 6 |
-
|
| 7 |
-
## Supported Formats
|
| 8 |
-
|
| 9 |
-
| Format | Parser | Description |
|
| 10 |
-
|--------|--------|-------------|
|
| 11 |
-
| CSV | CSVParser | Comma-separated financial data |
|
| 12 |
-
| PDF | HybridPDFParser | Dolphin-v2 + pdfplumber hybrid extraction |
|
| 13 |
-
| PDF | PDFParser | Legacy pdfplumber-only parser |
|
| 14 |
-
| XLSX/XLS | XLSXParser | Excel workbooks |
|
| 15 |
-
|
| 16 |
-
## PDF Hybrid Architecture
|
| 17 |
-
|
| 18 |
-
PDF files are processed by both Dolphin-v2 and pdfplumber:
|
| 19 |
-
1. Dolphin: layout analysis, document classification, element extraction
|
| 20 |
-
2. pdfplumber: gap-filling table + regex extraction
|
| 21 |
-
3. Merge: Dolphin fields take priority, pdfplumber fills gaps
|
| 22 |
-
|
| 23 |
-
If Dolphin is not installed, falls back to pdfplumber-only automatically.
|
| 24 |
-
|
| 25 |
-
## Usage
|
| 26 |
-
|
| 27 |
-
Use UnifiedParser for automatic format detection:
|
| 28 |
-
|
| 29 |
-
```python
|
| 30 |
-
from app.services.ingestion import UnifiedParser
|
| 31 |
-
|
| 32 |
-
report = UnifiedParser.parse(file_path, original_filename)
|
| 33 |
-
```
|
| 34 |
-
|
| 35 |
-
Or use specific parsers directly:
|
| 36 |
-
|
| 37 |
-
```python
|
| 38 |
-
from app.services.ingestion import CSVParser, HybridPDFParser, XLSXParser
|
| 39 |
-
|
| 40 |
-
report = CSVParser.parse(file_path)
|
| 41 |
-
report = HybridPDFParser.parse(file_path) # Dolphin + pdfplumber
|
| 42 |
-
report = XLSXParser.parse(file_path)
|
| 43 |
-
```
|
| 44 |
-
|
| 45 |
-
## Adding New Formats
|
| 46 |
-
|
| 47 |
-
1. Create `parser_xxx.py` with a class implementing `parse(file_path) -> FinancialReport`
|
| 48 |
-
2. Register in `unified_parser.py` SUPPORTED_EXTENSIONS dict
|
| 49 |
-
3. Add import in this `__init__.py`
|
| 50 |
-
"""
|
| 51 |
-
|
| 52 |
-
from app.services.ingestion.unified_parser import UnifiedParser
|
| 53 |
-
from app.services.ingestion.parser_csv import CSVParser
|
| 54 |
-
from app.services.ingestion.parser_pdf import PDFParser
|
| 55 |
-
from app.services.ingestion.parser_dolphin import HybridPDFParser
|
| 56 |
-
from app.services.ingestion.parser_xlsx import XLSXParser
|
| 57 |
-
from app.services.ingestion.mappings import DataMapper
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/ingestion/dolphin/__init__.py
DELETED
|
@@ -1,158 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Dolphin PDF Extraction Module — Hybrid Architecture.
|
| 3 |
-
|
| 4 |
-
Uses ByteDance Dolphin-v2 for advanced document layout analysis,
|
| 5 |
-
classification, and element extraction, combined with pdfplumber
|
| 6 |
-
for gap-filling and validation.
|
| 7 |
-
|
| 8 |
-
## Quick Check
|
| 9 |
-
|
| 10 |
-
```python
|
| 11 |
-
from app.services.ingestion.dolphin import is_dolphin_available, ensure_model_downloaded
|
| 12 |
-
|
| 13 |
-
if is_dolphin_available():
|
| 14 |
-
from app.services.ingestion.dolphin.client import DolphinClient
|
| 15 |
-
client = DolphinClient()
|
| 16 |
-
```
|
| 17 |
-
"""
|
| 18 |
-
|
| 19 |
-
import os
|
| 20 |
-
import logging
|
| 21 |
-
from typing import Optional
|
| 22 |
-
|
| 23 |
-
logger = logging.getLogger(__name__)
|
| 24 |
-
|
| 25 |
-
# Fallback location of the Dolphin-v2 weights: four directory levels above
# this package, under models/dolphin-v2.
DEFAULT_MODEL_DIR = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    *([os.pardir] * 4),
    "models",
    "dolphin-v2",
)

# Cached answer of is_dolphin_available(); None means "not checked yet".
_dolphin_available: Optional[bool] = None
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
def _detect_device() -> str:
    """Pick the compute device, preferring cuda, then mps, then cpu."""
    chosen = "cpu"
    notice = "Dolphin device: CPU mode"

    try:
        import torch

        if torch.cuda.is_available():
            chosen = "cuda"
            notice = "Dolphin device: CUDA GPU detected"
        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            chosen = "mps"
            notice = "Dolphin device: Apple MPS (Metal) detected"
    except ImportError:
        # torch is optional; without it only the CPU is usable.
        pass

    logger.info(notice)
    return chosen
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
def _get_model_path() -> str:
    """Return the configured model path, or the absolute default one."""
    configured = None
    try:
        from app.core.config import settings

        configured = settings.DOLPHIN_MODEL_PATH
    except Exception:
        # Best-effort lookup: any problem reading the settings simply
        # means the default location is used.
        pass

    if configured:
        return configured
    return os.path.abspath(DEFAULT_MODEL_DIR)
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
def is_dolphin_available() -> bool:
    """
    Check if Dolphin model and dependencies are installed.

    Dolphin counts as available when either a remote API URL is configured
    (the remote worker manages the model), or torch, transformers and PIL
    import successfully and the local model directory contains a
    ``config.json`` plus at least one weights file.

    Result is cached after first check.

    Returns:
        True if Dolphin can be used, False otherwise.
    """
    global _dolphin_available
    if _dolphin_available is not None:
        return _dolphin_available

    # Read the remote setting defensively, like the other helpers in this
    # module do: a missing attribute or unimportable config must not turn an
    # availability probe into a crash.
    try:
        from app.core.config import settings

        remote_url = getattr(settings, "DOLPHIN_API_URL", None)
    except Exception as e:
        logger.debug(f"Could not read Dolphin remote API setting: {e}")
        remote_url = None

    # If remote API is configured, we consider Dolphin available
    # (The remote worker manages the model)
    if remote_url:
        _dolphin_available = True
        return True

    try:
        import torch  # noqa: F401
        import transformers  # noqa: F401
        from PIL import Image  # noqa: F401
    except ImportError as e:
        logger.debug(f"Dolphin dependencies not installed: {e}")
        _dolphin_available = False
    else:
        model_path = _get_model_path()
        if not os.path.isdir(model_path):
            _dolphin_available = False
        else:
            try:
                has_config = os.path.exists(os.path.join(model_path, "config.json"))
                # Accept a single-file checkpoint or sharded
                # "model-*.safetensors" files.
                has_weights = (
                    os.path.exists(os.path.join(model_path, "model.safetensors"))
                    or os.path.exists(os.path.join(model_path, "pytorch_model.bin"))
                    or any(
                        name.startswith("model-") and name.endswith(".safetensors")
                        for name in os.listdir(model_path)
                    )
                )
                _dolphin_available = has_config and has_weights
            except OSError as e:
                # An unreadable model directory is treated as "no model".
                logger.debug(f"Could not inspect Dolphin model directory: {e}")
                _dolphin_available = False

    logger.info(f"Dolphin availability: {_dolphin_available}")
    return _dolphin_available
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
def ensure_model_downloaded(force: bool = False) -> str:
    """
    Download Dolphin-v2 model from HuggingFace if not already present.

    The model is considered present when the model directory exists and
    contains ``config.json``. After a successful download the cached
    availability flag is cleared so the next availability check re-inspects
    the directory.

    Args:
        force: If True, re-download even if model exists

    Returns:
        Path to the downloaded model directory

    Raises:
        RuntimeError: If the download fails for any reason (missing
            huggingface-hub, network error, ...); the original exception is
            chained as the cause.
    """
    global _dolphin_available

    model_path = _get_model_path()

    if not force and os.path.isdir(model_path):
        # NOTE(review): only config.json is checked here, whereas
        # is_dolphin_available() also requires a weights file.
        config_path = os.path.join(model_path, "config.json")
        if os.path.exists(config_path):
            logger.info(f"Dolphin model already present at {model_path}")
            return model_path

    logger.info("Downloading Dolphin-v2 model from HuggingFace...")

    try:
        from huggingface_hub import snapshot_download

        os.makedirs(model_path, exist_ok=True)
        snapshot_download(
            repo_id="ByteDance/Dolphin-v2",
            local_dir=model_path,
            local_dir_use_symlinks=False,
        )
        logger.info(f"Dolphin-v2 model downloaded to {model_path}")

        # Invalidate cache so next check picks up the new model
        _dolphin_available = None

        return model_path

    except Exception as e:
        logger.error(f"Failed to download Dolphin model: {e}")
        # Every fragment that interpolates a value needs its own f prefix;
        # without it the path placeholder was emitted literally.
        raise RuntimeError(
            f"Dolphin model download failed: {e}. "
            "Install huggingface-hub and ensure network access, "
            f"or manually download to: {model_path}"
        ) from e
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
def get_device() -> str:
    """Return the device named in settings, or auto-detect one."""
    configured = "auto"
    try:
        from app.core.config import settings

        configured = getattr(settings, "DOLPHIN_DEVICE", "auto")
    except Exception:
        # Settings could not be read; fall through to auto-detection.
        pass

    if configured != "auto":
        return configured
    return _detect_device()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/ingestion/dolphin/classifier.py
DELETED
|
@@ -1,288 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Document Classifier — Identifies financial document types from parsed content.
|
| 3 |
-
|
| 4 |
-
Uses Dolphin's structured output (headings, sections, tables) to classify
|
| 5 |
-
PDFs into specific financial document categories with confidence scoring.
|
| 6 |
-
"""
|
| 7 |
-
|
| 8 |
-
import re
|
| 9 |
-
import logging
|
| 10 |
-
from typing import List, Dict, Tuple, Optional
|
| 11 |
-
from dataclasses import dataclass, field
|
| 12 |
-
|
| 13 |
-
logger = logging.getLogger(__name__)
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
@dataclass
class DocumentClassification:
    """Outcome of classifying one parsed document."""

    # Primary document type
    doc_type: str
    # Score in the range 0.0 - 1.0
    confidence: float
    # Section phrases that matched for the primary type
    detected_sections: List[str] = field(default_factory=list)
    extraction_method: str = "dolphin_hybrid"
    # Additional statement types found in the same document
    secondary_types: List[str] = field(default_factory=list)
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
# ---------------------------------------------------------------------------
|
| 27 |
-
# Document type signature definitions
|
| 28 |
-
# ---------------------------------------------------------------------------
|
| 29 |
-
|
| 30 |
-
# Each type maps to a dict with "keywords", "sections", "min_keyword_matches" and "min_section_matches"
|
| 31 |
-
# Signature table used to score documents. For every document type:
#   keywords            - phrases looked up in the lower-cased text
#   sections            - section / line-item phrases looked up in the text
#   min_keyword_matches - keyword-hit threshold
#   min_section_matches - section-hit threshold
DOCUMENT_SIGNATURES: Dict[str, Dict] = {
    "10-K": dict(
        keywords=[
            "form 10-k",
            "annual report",
            "securities and exchange commission",
            "fiscal year ended",
            "10-k",
            "annual report pursuant",
        ],
        sections=[
            "consolidated statements of operations",
            "consolidated balance sheets",
            "consolidated statements of cash flows",
            "management's discussion and analysis",
            "risk factors",
        ],
        min_keyword_matches=2,
        min_section_matches=2,
    ),
    "10-Q": dict(
        keywords=[
            "form 10-q",
            "quarterly report",
            "securities and exchange commission",
            "fiscal quarter",
            "10-q",
            "quarterly report pursuant",
        ],
        sections=[
            "condensed consolidated statements",
            "condensed consolidated balance",
            "management's discussion",
        ],
        min_keyword_matches=2,
        min_section_matches=1,
    ),
    "income_statement": dict(
        keywords=[
            "income statement",
            "statement of operations",
            "statement of earnings",
            "profit and loss",
            "p&l",
            "statement of income",
            "consolidated statements of operations",
            "consolidated statements of income",
        ],
        sections=[
            "revenue",
            "net income",
            "operating income",
            "gross profit",
            "cost of goods sold",
            "operating expenses",
        ],
        min_keyword_matches=1,
        min_section_matches=2,
    ),
    "balance_sheet": dict(
        keywords=[
            "balance sheet",
            "statement of financial position",
            "consolidated balance sheets",
        ],
        sections=[
            "total assets",
            "total liabilities",
            "stockholders' equity",
            "current assets",
            "current liabilities",
            "cash and equivalents",
        ],
        min_keyword_matches=1,
        min_section_matches=2,
    ),
    "cash_flow_statement": dict(
        keywords=[
            "cash flow",
            "statement of cash flows",
            "consolidated statements of cash flows",
        ],
        sections=[
            "operating activities",
            "investing activities",
            "financing activities",
            "net change in cash",
        ],
        min_keyword_matches=1,
        min_section_matches=2,
    ),
    "bank_statement": dict(
        keywords=[
            "bank statement",
            "account statement",
            "transaction history",
            "account summary",
            "statement period",
            "beginning balance",
            "ending balance",
        ],
        sections=[
            "deposits",
            "withdrawals",
            "balance",
            "transaction date",
        ],
        min_keyword_matches=2,
        min_section_matches=1,
    ),
    "invoice": dict(
        keywords=[
            "invoice",
            "bill to",
            "ship to",
            "due date",
            "invoice number",
            "purchase order",
            "payment terms",
            "amount due",
        ],
        sections=[
            "subtotal",
            "tax",
            "total",
            "description",
            "quantity",
            "unit price",
        ],
        min_keyword_matches=2,
        min_section_matches=2,
    ),
    "tax_return": dict(
        keywords=[
            "tax return",
            "form 1040",
            "form 1120",
            "form 990",
            "internal revenue service",
            "irs",
            "taxable income",
            "adjusted gross income",
            "tax liability",
        ],
        sections=[
            "income",
            "deductions",
            "credits",
            "tax due",
            "refund",
        ],
        min_keyword_matches=2,
        min_section_matches=1,
    ),
}
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
class DocumentClassifier:
    """
    Classifies financial documents from parsed content.

    Uses a weighted keyword + section matching strategy against
    known document type signatures (DOCUMENT_SIGNATURES). Matching is plain
    substring search on the lower-cased text.

    Usage:
        classifier = DocumentClassifier()
        result = classifier.classify(full_text, dolphin_sections)
    """

    @staticmethod
    def classify(
        text_content: str,
        dolphin_sections: Optional[List[Dict]] = None,
        dolphin_elements: Optional[list] = None,
    ) -> "DocumentClassification":
        """
        Classify the document based on text content and structural elements.

        Args:
            text_content: Full extracted text from the document
            dolphin_sections: Layout sections from Dolphin (if available)
            dolphin_elements: Parsed elements from Dolphin (if available);
                accepted for interface compatibility, not used for scoring

        Returns:
            DocumentClassification with type, confidence, and detected
            sections. Empty text yields "general_financial" with confidence
            0.0; text matching no signature yields "general_financial" with
            confidence 0.1.
        """
        if not text_content:
            return DocumentClassification(
                doc_type="general_financial",
                confidence=0.0,
                extraction_method="dolphin_hybrid",
            )

        text_lower = text_content.lower()
        scores: Dict[str, float] = {}
        section_matches: Dict[str, List[str]] = {}

        for doc_type, signature in DOCUMENT_SIGNATURES.items():
            score, matched_sections = DocumentClassifier._score_document(
                text_lower, signature, dolphin_sections
            )
            scores[doc_type] = score
            section_matches[doc_type] = matched_sections

        # No signature scored at all: fall back to the generic type.
        if not scores or max(scores.values()) == 0:
            return DocumentClassification(
                doc_type="general_financial",
                confidence=0.1,
                extraction_method="dolphin_hybrid",
            )

        best_type = max(scores, key=scores.get)  # type: ignore[arg-type]
        best_score = scores[best_type]

        # Map the raw score onto 0-1: a score of 10 or more counts as full
        # confidence (raw scores are not bounded by 10, hence the cap).
        confidence = min(best_score / 10.0, 1.0)

        # Secondary types: other statements detected within the doc
        # (any other type scoring above 2.0).
        secondary = [
            t for t, s in scores.items()
            if s > 2.0 and t != best_type
        ]

        return DocumentClassification(
            doc_type=best_type,
            confidence=round(confidence, 3),
            detected_sections=section_matches.get(best_type, []),
            extraction_method="dolphin_hybrid",
            secondary_types=secondary,
        )

    @staticmethod
    def _score_document(
        text_lower: str,
        signature: Dict,
        dolphin_sections: Optional[List[Dict]] = None,
    ) -> Tuple[float, List[str]]:
        """
        Score a document against a type signature.

        Args:
            text_lower: Lower-cased document text.
            signature: Dict with "keywords", "sections",
                "min_keyword_matches" and "min_section_matches".
            dolphin_sections: Optional layout sections; each section's
                "type" label earns a bonus when it contains a section phrase.

        Returns (score, list_of_matched_sections). The score is 0.0 when
        both the keyword and the section threshold are missed.
        """
        keyword_hits = sum(1 for kw in signature["keywords"] if kw in text_lower)

        matched_sections = [sec for sec in signature["sections"] if sec in text_lower]
        section_hits = len(matched_sections)

        # Bonus from Dolphin structural analysis
        dolphin_bonus = 0.0
        if dolphin_sections:
            # A section may lack "type" or carry None; treat both as an
            # empty label instead of failing on None.lower().
            section_labels = [
                str(s.get("type") or "").lower() for s in dolphin_sections
            ]
            for sec in signature["sections"]:
                if any(sec in label for label in section_labels):
                    dolphin_bonus += 0.5

        # A signature is rejected only when BOTH minimums are missed.
        min_kw = signature["min_keyword_matches"]
        min_sec = signature["min_section_matches"]

        if keyword_hits < min_kw and section_hits < min_sec:
            return 0.0, matched_sections

        # Weighted score: keywords x 2 + sections x 1.5 + dolphin bonus
        score = (keyword_hits * 2.0) + (section_hits * 1.5) + dolphin_bonus

        return score, matched_sections

    @staticmethod
    def get_financial_statement_types(
        classification: "DocumentClassification",
    ) -> List[str]:
        """
        Return the list of financial statement types that should be
        extracted from this document.

        For a 10-K/10-Q (or a generic financial document), extract all
        three statements. For a standalone statement, extract just that one,
        plus whatever the detected secondary types map to. Unknown document
        types also get all three.
        """
        comprehensive_types = {"10-K", "10-Q", "general_financial"}

        if classification.doc_type in comprehensive_types:
            return ["income", "balance", "cash_flow"]

        type_map = {
            "income_statement": ["income"],
            "balance_sheet": ["balance"],
            "cash_flow_statement": ["cash_flow"],
            "bank_statement": ["cash_flow"],
            "invoice": ["income"],
            "tax_return": ["income"],
        }

        # Copy so the appends below never write into type_map's own lists.
        base = list(
            type_map.get(classification.doc_type, ["income", "balance", "cash_flow"])
        )

        # Add any secondary types detected
        for sec_type in classification.secondary_types:
            for extra in type_map.get(sec_type, []):
                if extra not in base:
                    base.append(extra)

        return base
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/ingestion/dolphin/extractor.py
DELETED
|
@@ -1,336 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Dolphin Extractor — Extracts structured financial data from Dolphin's parsed output.
|
| 3 |
-
|
| 4 |
-
Converts Dolphin's Markdown/JSON tables and text elements into
|
| 5 |
-
key-value financial data using the existing DataMapper.
|
| 6 |
-
"""
|
| 7 |
-
|
| 8 |
-
import re
|
| 9 |
-
import logging
|
| 10 |
-
from typing import Dict, List, Any, Optional
|
| 11 |
-
|
| 12 |
-
logger = logging.getLogger(__name__)
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
class DolphinExtractor:
|
| 16 |
-
"""
|
| 17 |
-
Extracts financial data from Dolphin's parsed output.
|
| 18 |
-
|
| 19 |
-
Works with DolphinPageResult and DolphinElement objects to produce
|
| 20 |
-
a flat dict of {field_name: value} pairs ready for FinancialReport
|
| 21 |
-
construction.
|
| 22 |
-
|
| 23 |
-
Usage:
|
| 24 |
-
extractor = DolphinExtractor()
|
| 25 |
-
data = extractor.extract(dolphin_result, doc_classification)
|
| 26 |
-
"""
|
| 27 |
-
|
| 28 |
-
@staticmethod
|
| 29 |
-
def extract(
|
| 30 |
-
dolphin_result, # DolphinDocumentResult
|
| 31 |
-
doc_classification=None, # DocumentClassification
|
| 32 |
-
) -> Dict[str, Any]:
|
| 33 |
-
"""
|
| 34 |
-
Extract all financial data from a Dolphin document result.
|
| 35 |
-
|
| 36 |
-
Args:
|
| 37 |
-
dolphin_result: DolphinDocumentResult from client.parse_document()
|
| 38 |
-
doc_classification: Optional classification to guide extraction
|
| 39 |
-
|
| 40 |
-
Returns:
|
| 41 |
-
Dict of {standardized_field_name: float_value}
|
| 42 |
-
"""
|
| 43 |
-
from app.services.ingestion.mappings import DataMapper
|
| 44 |
-
|
| 45 |
-
extracted = {}
|
| 46 |
-
tables_data = []
|
| 47 |
-
text_content_parts = []
|
| 48 |
-
|
| 49 |
-
for page in dolphin_result.pages:
|
| 50 |
-
for element in page.elements:
|
| 51 |
-
if element.element_type == "table":
|
| 52 |
-
table_rows = DolphinExtractor._parse_markdown_table(
|
| 53 |
-
element.content
|
| 54 |
-
)
|
| 55 |
-
tables_data.append(table_rows)
|
| 56 |
-
elif element.element_type == "text":
|
| 57 |
-
text_content_parts.append(element.content)
|
| 58 |
-
|
| 59 |
-
# --- Strategy 1: Table Extraction ---
|
| 60 |
-
for table_rows in tables_data:
|
| 61 |
-
table_data = DolphinExtractor._extract_from_table_rows(
|
| 62 |
-
table_rows, DataMapper
|
| 63 |
-
)
|
| 64 |
-
# Only overwrite if we haven't seen this field yet
|
| 65 |
-
for k, v in table_data.items():
|
| 66 |
-
if k not in extracted:
|
| 67 |
-
extracted[k] = v
|
| 68 |
-
|
| 69 |
-
# --- Strategy 2: Text/Regex Extraction from Dolphin output ---
|
| 70 |
-
full_text = "\n".join(text_content_parts)
|
| 71 |
-
if full_text:
|
| 72 |
-
text_data = DolphinExtractor._extract_from_text(full_text, DataMapper)
|
| 73 |
-
for k, v in text_data.items():
|
| 74 |
-
if k not in extracted:
|
| 75 |
-
extracted[k] = v
|
| 76 |
-
|
| 77 |
-
# --- Strategy 3: Full Markdown extraction (catch-all) ---
|
| 78 |
-
if dolphin_result.full_markdown:
|
| 79 |
-
markdown_data = DolphinExtractor._extract_from_text(
|
| 80 |
-
dolphin_result.full_markdown, DataMapper
|
| 81 |
-
)
|
| 82 |
-
for k, v in markdown_data.items():
|
| 83 |
-
if k not in extracted:
|
| 84 |
-
extracted[k] = v
|
| 85 |
-
|
| 86 |
-
logger.info(
|
| 87 |
-
f"Dolphin extracted {len(extracted)} fields from "
|
| 88 |
-
f"{len(tables_data)} tables and {len(text_content_parts)} text blocks"
|
| 89 |
-
)
|
| 90 |
-
|
| 91 |
-
return extracted
|
| 92 |
-
|
| 93 |
-
@staticmethod
def extract_company_name(dolphin_result) -> Optional[str]:
    """
    Attempt to extract company name from Dolphin's parsed output.

    Only the first two pages are inspected. For each page with Markdown,
    the following strategies are tried in order and the first hit wins:

    1. SEC filing: find the first "registrant" mention and walk backwards
       through the (up to) 10 lines preceding it, returning the nearest
       plausible line.
    2. The first Markdown heading on the page.
    3. The first plausible line among the first 30 lines of the page.

    Args:
        dolphin_result: Parsed document. Only ``pages`` and each page's
            ``markdown`` attribute are read.

    Returns:
        The candidate name truncated to 100 characters, or None when no
        page yields a plausible candidate.
    """

    def _strip_markup(raw: str) -> str:
        # Remove heading ("#") and emphasis ("*") markers together with the
        # whitespace between them. Chained .strip("#").strip("*") calls stop
        # at the space in "## **Name**" and leave the leading "**" behind.
        return raw.strip().strip("#* \t").strip()

    def _is_plausible(candidate: str, min_len: int) -> bool:
        # A name must be long enough, not a known boilerplate heading, and
        # contain at least one letter.
        return (
            len(candidate) > min_len
            and not _is_boilerplate(candidate)
            and any(c.isalpha() for c in candidate)
        )

    if not dolphin_result.pages:
        return None

    # Check first page(s) for company name patterns
    for page in dolphin_result.pages[:2]:
        markdown = page.markdown
        if not markdown:
            continue

        # SEC Filing: "Exact name of registrant as specified in its charter"
        registrant_match = re.search(
            r"(?:exact\s+name\s+of\s+registrant|registrant)",
            markdown,
            re.IGNORECASE,
        )
        if registrant_match:
            # Look for prominent text before this marker, nearest line first
            lines = markdown[: registrant_match.start()].strip().split("\n")
            for line in reversed(lines[-10:]):
                candidate = _strip_markup(line)
                if _is_plausible(candidate, 2):
                    return candidate[:100]

        # Markdown heading on first page
        heading_match = re.search(r"^#+\s+(.+)$", markdown, re.MULTILINE)
        if heading_match:
            # Headings are frequently bold ("# **Name**"); strip that too
            candidate = _strip_markup(heading_match.group(1))
            if len(candidate) > 2 and not _is_boilerplate(candidate):
                return candidate[:100]

        # First non-trivial line
        for line in markdown.split("\n")[:30]:
            candidate = _strip_markup(line)
            if _is_plausible(candidate, 3):
                return candidate[:100]

    return None
|
| 145 |
-
|
| 146 |
-
@staticmethod
|
| 147 |
-
def extract_fiscal_year(dolphin_result) -> Optional[str]:
|
| 148 |
-
"""Extract fiscal year/period from Dolphin output."""
|
| 149 |
-
if not dolphin_result.full_markdown:
|
| 150 |
-
return None
|
| 151 |
-
|
| 152 |
-
patterns = [
|
| 153 |
-
r"(?:YEAR|PERIOD|FISCAL\s+YEAR)\s+ENDED\s+([A-Z]+\s+\d{1,2},\s+\d{4})",
|
| 154 |
-
r"(?:for\s+the\s+year\s+ended)\s+([A-Z]+\s+\d{1,2},\s+\d{4})",
|
| 155 |
-
r"DECEMBER\s+31,\s+(\d{4})",
|
| 156 |
-
r"(\d{4})\s+(?:annual|fiscal)",
|
| 157 |
-
]
|
| 158 |
-
|
| 159 |
-
text = dolphin_result.full_markdown[:5000]
|
| 160 |
-
for pattern in patterns:
|
| 161 |
-
match = re.search(pattern, text, re.IGNORECASE)
|
| 162 |
-
if match:
|
| 163 |
-
return match.group(1)
|
| 164 |
-
|
| 165 |
-
return None
|
| 166 |
-
|
| 167 |
-
# ------------------------------------------------------------------
|
| 168 |
-
# Internal helpers
|
| 169 |
-
# ------------------------------------------------------------------
|
| 170 |
-
|
| 171 |
-
@staticmethod
|
| 172 |
-
def _parse_markdown_table(table_text: str) -> List[List[str]]:
|
| 173 |
-
"""
|
| 174 |
-
Parse a Markdown-format table into a list of rows.
|
| 175 |
-
|
| 176 |
-
Handles:
|
| 177 |
-
| Header1 | Header2 |
|
| 178 |
-
|---------|---------|
|
| 179 |
-
| val1 | val2 |
|
| 180 |
-
"""
|
| 181 |
-
rows = []
|
| 182 |
-
for line in table_text.strip().split("\n"):
|
| 183 |
-
line = line.strip()
|
| 184 |
-
if not line.startswith("|"):
|
| 185 |
-
continue
|
| 186 |
-
# Skip separator rows (|---|---|)
|
| 187 |
-
if all(re.match(r"^[\s\-:]+$", c) for c in line.split("|") if c.strip()):
|
| 188 |
-
continue
|
| 189 |
-
|
| 190 |
-
cells = [cell.strip() for cell in line.split("|")]
|
| 191 |
-
# Remove empty first/last from leading/trailing pipes
|
| 192 |
-
cells = [c for c in cells if c != ""]
|
| 193 |
-
if cells:
|
| 194 |
-
rows.append(cells)
|
| 195 |
-
|
| 196 |
-
return rows
|
| 197 |
-
|
| 198 |
-
@staticmethod
def _extract_from_table_rows(
    rows: List[List[str]], data_mapper
) -> Dict[str, float]:
    """
    Turn parsed table rows into ``{standardized_field: value}``.

    The first cell of each row is treated as the label and passed to
    ``data_mapper.map_row``; rows whose label does not map are ignored.
    The value is read from the most-recent-year column when one was
    detected and it parses; otherwise the first parseable cell after the
    label is used. Values are multiplied by the unit scale announced in
    the first three rows ("in millions" / "in thousands"). If a field
    appears on several rows, the last row wins.

    Args:
        rows: Table rows as lists of cell strings.
        data_mapper: Object exposing ``map_row(label)``.

    Returns:
        Mapping of standardized field name to scaled float value; empty
        when ``rows`` is empty.
    """
    extracted = {}
    if not rows:
        return extracted

    # Column holding the most recent year, if the header rows reveal one
    year_col = _find_target_column(rows)

    # Unit scale announced near the top of the table; millions is checked first
    banner = " ".join(" ".join(cells) for cells in rows[:3]).lower()
    if re.search(r"in millions|amounts in millions", banner):
        scale = 1_000_000.0
    elif re.search(r"in thousands|amounts in thousands|\(in 000s\)", banner):
        scale = 1_000.0
    else:
        scale = 1.0

    for cells in rows:
        if len(cells) < 2:
            continue

        field_name = data_mapper.map_row(cells[0])
        if not field_name:
            continue

        amount = None
        if year_col is not None and year_col < len(cells):
            amount = _clean_financial_value(cells[year_col])

        if amount is None:
            # Fall back to the first cell after the label that parses
            parsed = (_clean_financial_value(cell) for cell in cells[1:])
            amount = next((v for v in parsed if v is not None), None)

        if amount is not None:
            extracted[field_name] = amount * scale

    return extracted
|
| 248 |
-
|
| 249 |
-
@staticmethod
def _extract_from_text(
    text: str, data_mapper
) -> Dict[str, float]:
    """
    Regex-based extraction from unstructured text.

    Catches line items in formats like:
    Revenue ............... $1,234,567
    Net Income              (456,789)

    For every field in ``data_mapper.FIELD_MAPPING`` the aliases are tried
    in their listed order; the first alias whose following number parses
    supplies the value for that field.

    An alias only matches as a whole word (optionally with a plural
    "s"/"es"), i.e. it must not be embedded in a longer run of ASCII
    letters. Without this, short aliases such as "ar", "ap" or "rent"
    match inside "year", "capital" or "current" and pick up unrelated
    numbers.

    NOTE(review): the gap between alias and number is unbounded and may
    span lines, so a label with no value of its own can still capture the
    next number in the document.

    Args:
        text: Text to scan.
        data_mapper: Object exposing ``FIELD_MAPPING`` as
            ``{field: [alias, ...]}``.

    Returns:
        Mapping of standardized field name to float value. No unit scaling
        is applied here.
    """
    data = {}

    for field, aliases in data_mapper.FIELD_MAPPING.items():
        for alias in aliases:
            pattern = re.compile(
                rf"(?<![A-Za-z]){re.escape(alias)}(?:e?s)?(?![A-Za-z])"
                rf"[^0-9\-]*?(\(?[\d,]+\.?\d*\)?)",
                re.IGNORECASE,
            )
            match = pattern.search(text)
            if not match:
                continue

            val = _clean_financial_value(match.group(1))
            if val is not None:
                data[field] = val
                break

    return data
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
# ---------------------------------------------------------------------------
|
| 282 |
-
# Module-level utility functions
|
| 283 |
-
# ---------------------------------------------------------------------------
|
| 284 |
-
|
| 285 |
-
def _find_target_column(rows: List[List[str]]) -> Optional[int]:
|
| 286 |
-
"""Find the column index containing the most recent year."""
|
| 287 |
-
max_year = 0
|
| 288 |
-
target_col = None
|
| 289 |
-
|
| 290 |
-
for row in rows[:5]: # Check headers
|
| 291 |
-
for idx, cell in enumerate(row):
|
| 292 |
-
cell_clean = cell.replace("$", "").strip()
|
| 293 |
-
if re.match(r"^\d{4}$", cell_clean):
|
| 294 |
-
year = int(cell_clean)
|
| 295 |
-
if 2000 < year < 2100 and year > max_year:
|
| 296 |
-
max_year = year
|
| 297 |
-
target_col = idx
|
| 298 |
-
|
| 299 |
-
return target_col
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
def _clean_financial_value(val_str: Optional[str]) -> Optional[float]:
|
| 303 |
-
"""Convert financial string formats to float."""
|
| 304 |
-
if not val_str:
|
| 305 |
-
return None
|
| 306 |
-
|
| 307 |
-
s = val_str.strip().replace("$", "").replace(",", "").replace(" ", "")
|
| 308 |
-
if not s:
|
| 309 |
-
return None
|
| 310 |
-
|
| 311 |
-
# Handle parentheses as negative: (123) → -123
|
| 312 |
-
if "(" in s and ")" in s:
|
| 313 |
-
s = s.replace("(", "-").replace(")", "")
|
| 314 |
-
|
| 315 |
-
# Handle em-dash or dash as zero
|
| 316 |
-
if s in ("-", "—", "–"):
|
| 317 |
-
return 0.0
|
| 318 |
-
|
| 319 |
-
try:
|
| 320 |
-
return float(s)
|
| 321 |
-
except ValueError:
|
| 322 |
-
return None
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
_BOILERPLATE_PHRASES = {
|
| 326 |
-
"table of contents", "contents", "index", "financial statements",
|
| 327 |
-
"consolidated financial statements", "annual report", "quarterly report",
|
| 328 |
-
"10-k", "10-q", "form 10-k", "form 10-q", "united states",
|
| 329 |
-
"securities and exchange commission", "washington", "d.c.",
|
| 330 |
-
"commission file number", "transition report",
|
| 331 |
-
}
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
def _is_boilerplate(text: str) -> bool:
|
| 335 |
-
"""Check if text is a common boilerplate heading."""
|
| 336 |
-
return text.strip().lower() in _BOILERPLATE_PHRASES or text.strip().isdigit()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/ingestion/dolphin/remote_client.py
DELETED
|
@@ -1,110 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Remote Dolphin Client — Consumes the Dolphin-as-a-Service API.
|
| 3 |
-
|
| 4 |
-
Sends PDF files to the external AI Worker (Hugging Face Space)
|
| 5 |
-
and receives structured extraction results.
|
| 6 |
-
"""
|
| 7 |
-
|
| 8 |
-
import os
|
| 9 |
-
import httpx
|
| 10 |
-
import logging
|
| 11 |
-
from typing import Optional, Dict, Any, List
|
| 12 |
-
from dataclasses import asdict
|
| 13 |
-
|
| 14 |
-
from app.core.config import settings
|
| 15 |
-
from app.services.ingestion.dolphin.client import (
|
| 16 |
-
DolphinDocumentResult,
|
| 17 |
-
DolphinPageResult,
|
| 18 |
-
DolphinLayoutResult,
|
| 19 |
-
DolphinElement,
|
| 20 |
-
)
|
| 21 |
-
|
| 22 |
-
logger = logging.getLogger(__name__)
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
class RemoteDolphinClient:
    """
    Client for the remote Dolphin AI worker service.

    Usage:
        client = RemoteDolphinClient(api_url="https://hf.space/...", api_key="...")
        result = client.parse_document("report.pdf")
    """

    def __init__(
        self,
        api_url: Optional[str] = None,
        api_key: Optional[str] = None,
        timeout: int = 300,  # 5 minutes for large PDFs
    ):
        """
        Configure the client.

        Args:
            api_url: Base URL of the worker; falls back to
                ``settings.DOLPHIN_API_URL``. A trailing "/" is removed.
            api_key: Bearer token; falls back to ``settings.DOLPHIN_API_KEY``.
                When empty, no Authorization header is sent.
            timeout: Timeout passed to the HTTP client, in seconds.

        Raises:
            ValueError: If no API URL is available from either source.
        """
        # `or ""` keeps .rstrip() from raising AttributeError on None, so a
        # missing URL reaches the explicit ValueError below instead.
        self.api_url = (api_url or settings.DOLPHIN_API_URL or "").rstrip("/")
        self.api_key = api_key or settings.DOLPHIN_API_KEY
        self.timeout = timeout

        if not self.api_url:
            raise ValueError("DOLPHIN_API_URL must be set for RemoteDolphinClient")

        logger.info(f"Initialized RemoteDolphinClient pointing to {self.api_url}")

    def parse_document(self, pdf_path: str) -> DolphinDocumentResult:
        """
        Send PDF to remote worker and reconstruct the result object.

        The file is uploaded as multipart form field "file" to
        ``<api_url>/process``.

        Args:
            pdf_path: Path of the PDF on the local filesystem.

        Returns:
            The reconstructed result. A missing file is logged and yields an
            empty result (``total_pages=0``) rather than an exception.

        Raises:
            RuntimeError: If the worker answers with an HTTP error status.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        if not os.path.exists(pdf_path):
            logger.error(f"PDF not found: {pdf_path}")
            return DolphinDocumentResult(total_pages=0)

        url = f"{self.api_url}/process"
        headers = {}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        try:
            logger.info(f"Sending {pdf_path} to remote Dolphin worker...")

            # The file handle must stay open until the upload has completed
            with open(pdf_path, "rb") as f:
                files = {"file": (os.path.basename(pdf_path), f, "application/pdf")}

                with httpx.Client(timeout=self.timeout) as client:
                    response = client.post(url, files=files, headers=headers)
                    response.raise_for_status()

            data = response.json()
            return self._reconstruct_result(data)

        except httpx.HTTPStatusError as e:
            logger.error(f"Remote Dolphin API error: {e.response.text}")
            raise RuntimeError(f"Dolphin API failed: {e.response.status_code}") from e
        except Exception as e:
            logger.error(f"Remote Dolphin client failed: {e}")
            raise

    def _reconstruct_result(self, data: Dict[str, Any]) -> DolphinDocumentResult:
        """
        Convert JSON response back to DolphinDocumentResult objects.

        Each page must carry "page_number" and "markdown", and each layout
        must carry "page_number" (a missing key raises KeyError). All other
        keys are optional and default to empty values. Element dicts are
        passed to ``DolphinElement`` as keyword arguments.
        """
        pages = []
        for page in data.get("pages", []):
            elements = [
                DolphinElement(**element) for element in page.get("elements", [])
            ]
            pages.append(DolphinPageResult(
                page_number=page["page_number"],
                markdown=page["markdown"],
                structured_json=page.get("structured_json", {}),
                elements=elements,
            ))

        layouts = []
        for layout in data.get("layouts", []):
            layouts.append(DolphinLayoutResult(
                page_number=layout["page_number"],
                sections=layout.get("sections", []),
                reading_order=layout.get("reading_order", []),
                doc_type_hint=layout.get("doc_type_hint", "unknown"),
            ))

        return DolphinDocumentResult(
            pages=pages,
            layouts=layouts,
            full_markdown=data.get("full_markdown", ""),
            total_pages=data.get("total_pages", 0),
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/ingestion/mappings.py
DELETED
|
@@ -1,315 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Data Mapper - Field name normalization for financial data.
|
| 3 |
-
|
| 4 |
-
Maps various field names from different file formats (CSV, PDF, XLSX)
|
| 5 |
-
to standardized internal field names.
|
| 6 |
-
"""
|
| 7 |
-
|
| 8 |
-
from typing import Dict, List, Optional, Tuple
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
class DataMapper:
    """
    Maps raw field names to standardized internal field names.

    Matching works on a normalized label (lower-cased, trimmed, "_" replaced
    by " "). An exact hit on a field name or alias wins outright; otherwise
    the longest alias found in the label as a whole word decides, subject to
    the per-field ``EXCLUSIONS``.

    Usage:
        field = DataMapper.map_row("Total Revenue")  # Returns "revenue"
        field = DataMapper.map_row("Accounts Receivable")  # Returns "accounts_receivable"
    """

    FIELD_MAPPING: Dict[str, List[str]] = {
        # =================================================================
        # INCOME STATEMENT
        # =================================================================
        "revenue": [
            "revenue", "sales", "gross sales", "total revenue", "net sales",
            "total net sales", "net revenue", "total sales", "service revenue",
            "product revenue", "subscription revenue", "recurring revenue",
            "operating revenue", "revenues, net", "revenues"
        ],
        "cogs": [
            "cogs", "cost of goods", "direct costs", "cost of sales",
            "cost of revenue", "cost of goods sold", "cost of products sold",
            "cost of services", "direct cost", "cost of merchandise"
        ],
        "marketing_expenses": [
            "marketing", "ad spend", "advertising", "marketing expense",
            "promotion", "marketing and advertising", "advertising expense",
            "marketing costs", "promotional expense", "customer acquisition"
        ],
        "payroll_expenses": [
            "payroll", "salaries", "wages", "employee costs", "personnel",
            "labor", "compensation", "salaries and wages", "employee benefits",
            "stock compensation", "share-based compensation", "labor cost",
            "wages and salaries", "staff costs"
        ],
        "rent_expense": [
            "rent", "lease", "occupancy", "facilities", "rent expense",
            "lease expense", "occupancy costs", "facility costs"
        ],
        "other_operating_expenses": [
            "other expense", "operating expense", "sga", "general and administrative",
            "g&a", "selling, general", "pre-opening", "impairment",
            "administrative expense", "operating expenses", "other operating",
            "research and development", "r&d", "utilities", "insurance"
        ],
        "depreciation": [
            "depreciation", "depreciation expense", "depreciation and amortization"
        ],
        "amortization": [
            "amortization", "amortization expense"
        ],
        "interest_expense": [
            "interest", "interest expense", "finance costs", "interest cost",
            "interest and finance charges", "borrowing costs"
        ],
        "taxes": [
            "tax", "income tax", "taxes", "provision for taxes", "income tax expense",
            "tax expense", "provision for income taxes"
        ],

        # =================================================================
        # BALANCE SHEET - ASSETS
        # =================================================================
        "cash": [
            "cash", "bank", "cash and equivalents", "cash & equivalents",
            "cash and cash equivalents", "cash on hand", "short-term investments",
            "cash, cash equivalents"
        ],
        "accounts_receivable": [
            "accounts receivable", "ar", "receivables", "trade receivables",
            "net receivables", "receivables, net", "trade accounts receivable"
        ],
        "inventory": [
            "inventory", "stock", "merchandise", "inventories",
            "merchandise inventory", "raw materials"
        ],
        "prepaid_expenses": [
            "prepaid", "prepaid expenses", "other current assets",
            "prepaid and other", "prepaids"
        ],
        "property_plant_equipment": [
            "ppe", "fixed assets", "property plant equipment", "equipment",
            "property, plant and equipment", "property and equipment",
            "net property", "fixed assets, net", "capital assets"
        ],
        "accumulated_depreciation": [
            "accumulated depreciation", "acc depreciation", "less depreciation"
        ],
        "intangible_assets": [
            "intangible assets", "goodwill", "soft assets", "intangibles",
            "goodwill and intangibles"
        ],

        # =================================================================
        # BALANCE SHEET - LIABILITIES
        # =================================================================
        "accounts_payable": [
            "accounts payable", "ap", "payables", "trade payables",
            "trade accounts payable"
        ],
        "accrued_liabilities": [
            "accrued liabilities", "accrued expenses", "accruals",
            "accrued and other"
        ],
        "short_term_debt": [
            "short term debt", "current portion of debt", "notes payable",
            "current debt", "short-term borrowings", "current portion of long-term debt"
        ],
        "long_term_debt": [
            "long term debt", "term loan", "non-current liabilities",
            "long-term borrowings", "bonds payable", "notes payable long-term"
        ],
        "deferred_revenue": [
            "deferred revenue", "unearned revenue", "contract liabilities",
            "deferred income"
        ],
        "total_equity": [
            "equity", "retained earnings", "shareholders equity", "total equity",
            "stockholders equity", "shareholders' equity", "stockholders' equity",
            "total shareholders equity", "net worth", "owner equity"
        ],

        # =================================================================
        # CASH FLOW STATEMENT
        # =================================================================
        "operating_cash_flow": [
            "operating cash flow", "cfo", "cash from operations",
            "cash flow from operating activities", "net cash from operating",
            "cash generated by operating activities", "operating activities",
            "net cash provided by operating", "cash flows from operating"
        ],
        "capex": [
            "capex", "capital expenditure", "purchase of property",
            "additions to property", "capital expenditures",
            "purchases of property", "property additions"
        ],
        "investing_cash_flow": [
            "investing cash flow", "cash from investing",
            "cash flow from investing activities", "investing activities",
            "net cash from investing", "cash flows from investing"
        ],
        "financing_cash_flow": [
            "financing cash flow", "cash from financing",
            "cash flow from financing activities", "financing activities",
            "net cash from financing", "cash flows from financing"
        ],

        # =================================================================
        # OPERATING METRICS
        # =================================================================
        "new_customers": ["new customers", "customer additions", "new users"],
        "total_transactions": ["transactions", "orders", "total orders"],
        "total_seats": ["seats", "licenses", "subscriptions"],
        "active_members": ["members", "active count", "active users"],
        "restaurant_margin": ["restaurant margin", "store margin"],
        "effective_tax_rate": ["effective tax rate", "tax rate"],
        "churn_rate": ["churn", "churn rate", "attrition", "cancellation rate"],
        "cac": ["cac", "acquisition cost", "customer acquisition cost"],
        "ltv": ["ltv", "lifetime value", "cltv", "customer lifetime value"],

        # =================================================================
        # DERIVED / SUMMARY ITEMS (often in Excel templates)
        # =================================================================
        "gross_profit": [
            "gross profit", "gross margin", "gross income"
        ],
        "operating_income": [
            "operating income", "operating profit", "ebit", "income from operations"
        ],
        "net_income": [
            "net income", "net profit", "net earnings", "net income attributable"
        ],
        "ebitda": [
            "ebitda", "earnings before interest"
        ],
        "total_assets": [
            "total assets", "assets total"
        ],
        "total_liabilities": [
            "total liabilities", "liabilities total"
        ],
    }

    # Exclusion rules: (field, [terms that should NOT trigger this field])
    EXCLUSIONS: Dict[str, List[str]] = {
        "revenue": ["cost", "marketable securities", "deferred"],
        "total_equity": ["awards", "liability", "liabilities", "debt"],
        "cash": ["non-cash", "noncash"],
        "depreciation": ["accum", "accumulated"],
    }

    # Field categories for validation
    INCOME_FIELDS = [
        "revenue", "cogs", "marketing_expenses", "payroll_expenses", "rent_expense",
        "other_operating_expenses", "depreciation", "amortization", "interest_expense", "taxes",
        "gross_profit", "operating_income", "net_income", "ebitda"
    ]

    BALANCE_FIELDS = [
        "cash", "accounts_receivable", "inventory", "prepaid_expenses",
        "property_plant_equipment", "accumulated_depreciation", "intangible_assets",
        "accounts_payable", "accrued_liabilities", "short_term_debt", "long_term_debt",
        "deferred_revenue", "total_equity", "total_assets", "total_liabilities"
    ]

    CASH_FIELDS = [
        "operating_cash_flow", "capex", "investing_cash_flow", "financing_cash_flow"
    ]

    @staticmethod
    def _normalize_label(row_label) -> str:
        """Lower-case and trim a raw label and turn underscores into spaces."""
        return str(row_label).lower().strip().replace("_", " ")

    @staticmethod
    def _alias_in_label(alias: str, label_clean: str) -> bool:
        """
        Tell whether ``alias`` occurs in ``label_clean`` as a whole word.

        The alias may carry a plural "s"/"es" but must not be embedded in a
        longer run of letters. Plain substring matching maps "total current
        assets" to rent_expense (via "rent") and "year" to
        accounts_receivable (via "ar").
        """
        import re  # local import: this module otherwise only needs typing

        pattern = rf"(?<![a-z]){re.escape(alias)}(?:e?s)?(?![a-z])"
        return re.search(pattern, label_clean) is not None

    @staticmethod
    def _exact_match(label_clean: str) -> Optional[str]:
        """Return the first field whose name or alias equals the label."""
        for field, aliases in DataMapper.FIELD_MAPPING.items():
            if (
                label_clean == field
                # The label had "_" replaced by " ", so compare like with like
                or label_clean == field.replace("_", " ")
                or label_clean in aliases
            ):
                return field
        return None

    @staticmethod
    def _best_partial_match(label_clean: str) -> Tuple[Optional[str], int]:
        """
        Find the field with the longest alias contained in the label.

        Fields whose exclusion terms occur anywhere in the label are
        skipped. On equal alias length the first field encountered wins.

        Returns:
            ``(field, alias_length)``, or ``(None, 0)`` when nothing matches.
        """
        best_field = None
        best_len = 0

        for field, aliases in DataMapper.FIELD_MAPPING.items():
            blocked_terms = DataMapper.EXCLUSIONS.get(field, ())
            if any(term in label_clean for term in blocked_terms):
                continue

            for alias in aliases:
                # Longest alias match wins (more specific); skip the regex
                # for aliases that could not win anyway.
                if len(alias) > best_len and DataMapper._alias_in_label(alias, label_clean):
                    best_len = len(alias)
                    best_field = field

        return best_field, best_len

    @staticmethod
    def map_row(row_label: str) -> Optional[str]:
        """
        Map a raw field label to a standardized field name.

        Args:
            row_label: The raw label from the source file

        Returns:
            Standardized field name, or None if no match found
        """
        if not row_label:
            return None

        label_clean = DataMapper._normalize_label(row_label)

        # Direct match check first
        exact = DataMapper._exact_match(label_clean)
        if exact is not None:
            return exact

        # Keyword matching with longest match wins
        best_match_field, _ = DataMapper._best_partial_match(label_clean)
        return best_match_field

    @staticmethod
    def map_row_with_confidence(row_label: str) -> Tuple[Optional[str], float]:
        """
        Map a row label and return confidence score.

        Returns:
            Tuple of (field_name, confidence) where confidence is 0.0-1.0.
            Exact matches score 1.0; partial matches score the share of the
            label covered by the alias, capped at 0.95; no match is
            ``(None, 0.0)``.
        """
        if not row_label:
            return None, 0.0

        label_clean = DataMapper._normalize_label(row_label)

        # Exact match = 1.0 confidence
        exact = DataMapper._exact_match(label_clean)
        if exact is not None:
            return exact, 1.0

        # Partial match = proportional confidence
        best_match_field, alias_len = DataMapper._best_partial_match(label_clean)
        if best_match_field is None:
            return None, 0.0

        # Confidence based on how much of the label is matched
        confidence = alias_len / len(label_clean)
        return best_match_field, min(confidence, 0.95)  # Cap at 0.95 for non-exact

    @staticmethod
    def get_statement_type(field: str) -> Optional[str]:
        """
        Determine which financial statement a field belongs to.

        Returns:
            "income", "balance", "cash_flow", or None
        """
        if field in DataMapper.INCOME_FIELDS:
            return "income"
        elif field in DataMapper.BALANCE_FIELDS:
            return "balance"
        elif field in DataMapper.CASH_FIELDS:
            return "cash_flow"
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/ingestion/parser_csv.py
DELETED
|
@@ -1,127 +0,0 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
-
import re
|
| 3 |
-
from typing import Dict, Any, Optional
|
| 4 |
-
from app.schemas.financial import (
|
| 5 |
-
FinancialReport,
|
| 6 |
-
BalanceSheetStandard,
|
| 7 |
-
IncomeStatementStandard,
|
| 8 |
-
CashFlowStandard,
|
| 9 |
-
OperatingMetrics,
|
| 10 |
-
PeriodType,
|
| 11 |
-
Currency
|
| 12 |
-
)
|
| 13 |
-
from datetime import date
|
| 14 |
-
|
| 15 |
-
from app.services.ingestion.mappings import DataMapper
|
| 16 |
-
|
| 17 |
-
class CSVParser:
    """Turn a financial CSV export into a ``FinancialReport``.

    Two layouts are recognised:

    * Horizontal - at least three column headers map to a known field via
      ``DataMapper.map_row``; the last data row supplies the values.
    * Vertical - otherwise, when the file has at least two columns, column 0
      is read as the row label and column 1 as the value.

    Any field that is not found in the file defaults to ``0.0``.
    """

    @staticmethod
    def _to_number(raw: Any) -> float:
        """Coerce a raw CSV cell to ``float``.

        Strings are stripped of every character except digits, ``.`` and
        ``-`` before conversion; strings that still do not parse, as well as
        null cells, become ``0.0``.
        """
        if isinstance(raw, str):
            cleaned = re.sub(r'[^\d.-]', '', raw)
            try:
                return float(cleaned)
            except ValueError:
                # e.g. "", "-", "n/a" -> nothing numeric left after cleaning.
                return 0.0
        return float(raw) if pd.notnull(raw) else 0.0

    @staticmethod
    def parse(file_path: str) -> FinancialReport:
        """Read ``file_path`` with pandas and build a ``FinancialReport``.

        Args:
            file_path: Path of the CSV file to read.

        Returns:
            A report whose company name, period end (today), period type
            (annual) and currency (USD) are fixed placeholders; only the
            numeric statement fields come from the file.
        """
        df = pd.read_csv(file_path)

        data_dict: Dict[str, float] = {}

        # Map every header once; reused for both detection and extraction.
        mapped_columns = [(col, DataMapper.map_row(str(col))) for col in df.columns]
        matches = sum(1 for _, field in mapped_columns if field)

        if matches >= 3:
            # Horizontal format: columns are fields, last row is most recent.
            # A header-only file has no row to read; leave every field at its
            # default instead of failing on df.iloc[-1].
            if not df.empty:
                last_row = df.iloc[-1]
                for col, field in mapped_columns:
                    if field:
                        data_dict[field] = CSVParser._to_number(last_row[col])

        elif len(df.columns) >= 2:
            # Vertical (key-value) format. Columns are addressed by position
            # with .iloc: integer keys on a label-indexed row (row[0]) rely on
            # a deprecated pandas fallback.
            for label_raw, val_raw in zip(df.iloc[:, 0], df.iloc[:, 1]):
                field = DataMapper.map_row(str(label_raw))
                if field:
                    data_dict[field] = CSVParser._to_number(val_raw)

        def get(key, default=0.0):
            return data_dict.get(key, default)

        def optional_int(key) -> Optional[int]:
            # Absent or zero counts are reported as None rather than 0.
            value = get(key)
            return int(value) if value else None

        income = IncomeStatementStandard(
            revenue=get("revenue"),
            cogs=get("cogs"),
            marketing_expenses=get("marketing_expenses"),
            payroll_expenses=get("payroll_expenses"),
            rent_expense=get("rent_expense"),
            other_operating_expenses=get("other_operating_expenses"),
            depreciation=get("depreciation"),
            amortization=get("amortization"),
            interest_expense=get("interest_expense"),
            taxes=get("taxes"),
        )

        balance = BalanceSheetStandard(
            cash=get("cash"),
            accounts_receivable=get("accounts_receivable"),
            inventory=get("inventory"),
            prepaid_expenses=get("prepaid_expenses"),
            property_plant_equipment=get("property_plant_equipment"),
            accumulated_depreciation=get("accumulated_depreciation"),
            intangible_assets=get("intangible_assets"),
            accounts_payable=get("accounts_payable"),
            accrued_liabilities=get("accrued_liabilities"),
            short_term_debt=get("short_term_debt"),
            long_term_debt=get("long_term_debt"),
            deferred_revenue=get("deferred_revenue"),
            total_equity=get("total_equity"),
        )

        cash_flow = CashFlowStandard(
            operating_cash_flow=get("operating_cash_flow"),
            capex=get("capex"),
            investing_cash_flow=get("investing_cash_flow"),
            financing_cash_flow=get("financing_cash_flow"),
        )

        metrics = OperatingMetrics(
            industry='general',  # Default, could extract from metadata
            new_customers=optional_int("new_customers"),
            total_transactions=optional_int("total_transactions"),
            total_seats=optional_int("total_seats"),
        )

        return FinancialReport(
            company_name="Imported Company",
            period_end=date.today(),
            period_type=PeriodType.ANNUAL,
            currency=Currency.USD,
            income_statement=income,
            balance_sheet=balance,
            cash_flow=cash_flow,
            metrics=metrics,
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/ingestion/parser_dolphin.py
DELETED
|
@@ -1,429 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Hybrid PDF Parser — Combines Dolphin-v2 and pdfplumber for optimal extraction.
|
| 3 |
-
|
| 4 |
-
Both engines process every PDF:
|
| 5 |
-
Stage 1: Dolphin layout analysis (document structure & reading order)
|
| 6 |
-
Stage 2: Document classification (10-K, invoice, bank statement, etc.)
|
| 7 |
-
Stage 3: Dolphin element extraction (tables, text, formulas)
|
| 8 |
-
Stage 4: pdfplumber gap-fill & validation (tables + regex fallback)
|
| 9 |
-
Stage 5: Merge & normalize → FinancialReport
|
| 10 |
-
"""
|
| 11 |
-
|
| 12 |
-
import logging
|
| 13 |
-
import re
|
| 14 |
-
from typing import Dict, Any, Optional, List
|
| 15 |
-
from datetime import date
|
| 16 |
-
|
| 17 |
-
from app.schemas.financial import (
|
| 18 |
-
FinancialReport,
|
| 19 |
-
BalanceSheetStandard,
|
| 20 |
-
IncomeStatementStandard,
|
| 21 |
-
CashFlowStandard,
|
| 22 |
-
OperatingMetrics,
|
| 23 |
-
PeriodType,
|
| 24 |
-
Currency,
|
| 25 |
-
)
|
| 26 |
-
from app.services.ingestion.mappings import DataMapper
|
| 27 |
-
|
| 28 |
-
logger = logging.getLogger(__name__)
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
class HybridPDFParser:
    """
    Hybrid parser that combines Dolphin-v2 deep parsing with pdfplumber
    gap-filling on every PDF for maximum extraction coverage.

    Implements the same `parse(file_path) -> FinancialReport` interface
    as the original PDFParser.
    """

    @staticmethod
    def parse(file_path: str) -> FinancialReport:
        """
        Parse a PDF using the hybrid Dolphin + pdfplumber pipeline.

        Stages:
          1. Dolphin: layout + page parsing
          2. Classify: determine document type
          3. Dolphin: extract structured data from elements
          4. pdfplumber: gap-fill with table + regex extraction
          5. Merge: Dolphin data takes priority, pdfplumber fills gaps

        Falls back to pdfplumber-only if Dolphin is unavailable or raises.

        Args:
            file_path: Path of the PDF to parse.

        Returns:
            The assembled ``FinancialReport``; its metadata records which
            extraction method was used.
        """
        dolphin_data = {}
        classification = None
        dolphin_company_name = None
        dolphin_fiscal_year = None
        extraction_method = "pdfplumber"

        # -----------------------------------------------------------------
        # Stage 1-3: Dolphin Extraction
        # -----------------------------------------------------------------
        try:
            from app.services.ingestion.dolphin import is_dolphin_available

            if is_dolphin_available():
                logger.info("Dolphin available — running hybrid extraction")
                dolphin_data, classification, dolphin_company_name, dolphin_fiscal_year = (
                    HybridPDFParser._run_dolphin_stages(file_path)
                )
                extraction_method = "dolphin_hybrid"
            else:
                logger.info("Dolphin not available — pdfplumber-only mode")
        except Exception as e:
            # Best effort: any Dolphin failure (including a failed import)
            # degrades to pdfplumber-only instead of aborting the parse.
            logger.warning(f"Dolphin extraction failed, continuing with pdfplumber: {e}")

        # -----------------------------------------------------------------
        # Stage 4: pdfplumber Gap-Fill
        # -----------------------------------------------------------------
        pdfplumber_data, pdfplumber_text = HybridPDFParser._run_pdfplumber(file_path)

        # -----------------------------------------------------------------
        # Stage 5: Merge — Dolphin takes priority, pdfplumber fills gaps
        # -----------------------------------------------------------------
        merged_data = HybridPDFParser._merge_extractions(dolphin_data, pdfplumber_data)

        logger.info(
            f"Merged extraction: {len(dolphin_data)} Dolphin fields + "
            f"{len(pdfplumber_data)} pdfplumber fields → "
            f"{len(merged_data)} total fields"
        )

        # -----------------------------------------------------------------
        # Build FinancialReport
        # -----------------------------------------------------------------
        return HybridPDFParser._build_report(
            extracted_data=merged_data,
            text_content=pdfplumber_text,
            file_path=file_path,
            extraction_method=extraction_method,
            classification=classification,
            dolphin_company_name=dolphin_company_name,
            dolphin_fiscal_year=dolphin_fiscal_year,
        )

    # ==================================================================
    # Stage Implementations
    # ==================================================================

    @staticmethod
    def _run_dolphin_stages(file_path: str):
        """Stages 1-3: Dolphin layout, classification, and extraction.

        Returns:
            A 4-tuple ``(extracted, classification, company_name,
            fiscal_year)``. When the parsed document reports zero pages the
            tuple is ``({}, None, None, None)``.
        """
        from app.services.ingestion.dolphin.client import DolphinClient
        from app.services.ingestion.dolphin.classifier import DocumentClassifier
        from app.services.ingestion.dolphin.extractor import DolphinExtractor

        # Stage 1: Parse entire document
        # Use factory to get Local or Remote client
        client = DolphinClient.create()
        doc_result = client.parse_document(file_path)

        if doc_result.total_pages == 0:
            return {}, None, None, None

        # Stage 2: Classify document type
        # Collect section info from layouts
        all_sections = [
            section
            for layout in doc_result.layouts
            for section in layout.sections
        ]

        classification = DocumentClassifier.classify(
            text_content=doc_result.full_markdown,
            dolphin_sections=all_sections,
        )

        logger.info(
            f"Document classified as '{classification.doc_type}' "
            f"(confidence: {classification.confidence:.2f})"
        )

        # Stage 3: Extract structured financial data
        extracted = DolphinExtractor.extract(doc_result, classification)

        # Also try to extract company name and fiscal year
        company_name = DolphinExtractor.extract_company_name(doc_result)
        fiscal_year = DolphinExtractor.extract_fiscal_year(doc_result)

        return extracted, classification, company_name, fiscal_year

    @staticmethod
    def _run_pdfplumber(file_path: str):
        """
        Stage 4: pdfplumber extraction — tables + regex.

        Reuses the proven logic from the existing PDFParser.

        Returns:
            ``(extracted_data, text_content)``. If extraction raises part
            way through, the failure is logged and whatever was collected so
            far is returned.
        """
        from app.services.ingestion.parser_pdf import PDFParser
        import pdfplumber

        extracted_data = {}
        # Collected per page and joined once, instead of repeated str +=.
        page_texts: List[str] = []

        try:
            with pdfplumber.open(file_path) as pdf:
                # Statement page locator
                statement_pages = PDFParser._find_statement_pages(pdf)

                # Extract from identified statement pages
                for stmt_type, page in statement_pages.items():
                    allowed_fields = None
                    if stmt_type == "income":
                        allowed_fields = DataMapper.INCOME_FIELDS
                    elif stmt_type == "balance":
                        allowed_fields = DataMapper.BALANCE_FIELDS
                    elif stmt_type == "cash_flow":
                        allowed_fields = DataMapper.CASH_FIELDS

                    table_data = PDFParser._extract_table_data(page, allowed_fields)
                    extracted_data.update(table_data)

                # Full text extraction for regex fallback
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        page_texts.append(page_text + "\n")

                # Regex fallback for missing fields
                regex_data = PDFParser._extract_via_regex(
                    "".join(page_texts), existing_keys=extracted_data.keys()
                )
                extracted_data.update(regex_data)

        except Exception as e:
            logger.warning(f"pdfplumber extraction failed: {e}")

        return extracted_data, "".join(page_texts)

    @staticmethod
    def _merge_extractions(
        dolphin_data: Dict[str, Any],
        pdfplumber_data: Dict[str, Any],
    ) -> Dict[str, Any]:
        """
        Merge Dolphin and pdfplumber extractions.

        Priority: Dolphin fields take precedence.
        pdfplumber fills any gaps not covered by Dolphin. A Dolphin value
        counts as a gap when the key is absent, when it is ``None``, or when
        it is ``0.0`` and pdfplumber found a real non-zero value.

        Neither input dict is modified.
        """
        merged = dict(dolphin_data)  # Start with Dolphin data

        for key, value in pdfplumber_data.items():
            current = merged.get(key)
            if current is None:
                # Key missing, or Dolphin reported None: previously a None
                # entry blocked pdfplumber's value and the field ended up 0.
                merged[key] = value
            elif current == 0.0 and value is not None and value != 0.0:
                # If Dolphin gave 0 but pdfplumber found a value, prefer pdfplumber
                merged[key] = value

        return merged

    # ==================================================================
    # Report Construction (mirrors PDFParser logic)
    # ==================================================================

    @staticmethod
    def _build_report(
        extracted_data: Dict,
        text_content: str,
        file_path: str,
        extraction_method: str,
        classification=None,
        dolphin_company_name: Optional[str] = None,
        dolphin_fiscal_year: Optional[str] = None,
    ) -> FinancialReport:
        """Build a FinancialReport from merged extracted data.

        Missing or ``None`` fields are treated as ``0.0``. Subtotals
        (gross profit, EBITDA, operating income, net income and the balance
        sheet totals) are computed here from the line items rather than read
        from the document.
        """

        def get(key, default=0.0):
            val = extracted_data.get(key)
            return val if val is not None else default

        def optional(key, cast=None):
            # Operating metrics are optional: absent or zero becomes None.
            val = get(key)
            if not val:
                return None
            return cast(val) if cast is not None else val

        # --- Income Statement ---
        revenue = get("revenue")
        cogs = get("cogs")
        marketing = get("marketing_expenses")
        payroll = get("payroll_expenses")
        rent = get("rent_expense")
        other = get("other_operating_expenses")
        depreciation = get("depreciation")
        amortization = get("amortization")
        interest = get("interest_expense")
        taxes = get("taxes")

        op_expenses = marketing + payroll + rent + other
        gross_profit = revenue - cogs
        ebitda = gross_profit - op_expenses
        op_income = ebitda - depreciation - amortization
        net_income = op_income - interest - taxes

        income = IncomeStatementStandard(
            revenue=revenue, cogs=cogs,
            marketing_expenses=marketing, payroll_expenses=payroll,
            rent_expense=rent, other_operating_expenses=other,
            depreciation=depreciation, amortization=amortization,
            interest_expense=interest, taxes=taxes,
            operating_expenses=op_expenses, gross_profit=gross_profit,
            ebitda=ebitda, operating_income=op_income, net_income=net_income,
        )

        # --- Balance Sheet ---
        cash = get("cash")
        ar = get("accounts_receivable")
        inv = get("inventory")
        prepaid = get("prepaid_expenses")
        ppe = get("property_plant_equipment")
        accum_dep = get("accumulated_depreciation")
        intangibles = get("intangible_assets")
        ap = get("accounts_payable")
        accrued = get("accrued_liabilities")
        st_debt = get("short_term_debt")
        lt_debt = get("long_term_debt")
        deferred = get("deferred_revenue")
        equity = get("total_equity")

        bs_current_assets = cash + ar + inv + prepaid
        bs_total_assets = bs_current_assets + (ppe - accum_dep) + intangibles
        bs_current_liab = ap + accrued + st_debt
        bs_total_liab = bs_current_liab + lt_debt + deferred

        balance = BalanceSheetStandard(
            cash=cash, accounts_receivable=ar, inventory=inv,
            prepaid_expenses=prepaid, property_plant_equipment=ppe,
            accumulated_depreciation=accum_dep, intangible_assets=intangibles,
            accounts_payable=ap, accrued_liabilities=accrued,
            short_term_debt=st_debt, long_term_debt=lt_debt,
            deferred_revenue=deferred, total_equity=equity,
            total_current_assets=bs_current_assets, total_assets=bs_total_assets,
            total_current_liabilities=bs_current_liab, total_liabilities=bs_total_liab,
        )

        # --- Cash Flow ---
        cash_flow = CashFlowStandard(
            operating_cash_flow=get("operating_cash_flow"),
            capex=get("capex"),
            investing_cash_flow=get("investing_cash_flow"),
            financing_cash_flow=get("financing_cash_flow"),
            net_change_in_cash=get("net_change_in_cash"),
        )

        # --- Operating Metrics ---
        metrics = OperatingMetrics(
            # A non-zero restaurant margin is the only industry signal used.
            industry="restaurant" if get("restaurant_margin") else "general",
            new_customers=optional("new_customers", int),
            total_transactions=optional("total_transactions", int),
            total_seats=optional("total_seats", int),
            churn_rate=optional("churn_rate"),
            cac=optional("cac"),
            ltv=optional("ltv"),
        )

        # --- Metadata ---
        metadata = {
            "extraction_method": extraction_method,
            "extracted_restaurant_margin": str(get("restaurant_margin")),
            "extracted_effective_tax_rate": str(get("effective_tax_rate")),
        }

        if classification:
            metadata["document_type"] = classification.doc_type
            metadata["classification_confidence"] = str(classification.confidence)
            metadata["detected_sections"] = ",".join(classification.detected_sections)

        # --- Company Name ---
        company_name = HybridPDFParser._resolve_company_name(
            dolphin_name=dolphin_company_name,
            text_content=text_content,
            file_path=file_path,
        )

        # --- Fiscal Year ---
        fiscal_year_date = HybridPDFParser._resolve_fiscal_year(
            dolphin_year=dolphin_fiscal_year,
            text_content=text_content,
        )

        return FinancialReport(
            company_name=company_name,
            period_end=fiscal_year_date,
            period_type=PeriodType.ANNUAL,
            currency=Currency.USD,
            income_statement=income,
            balance_sheet=balance,
            cash_flow=cash_flow,
            metrics=metrics,
            metadata=metadata,
        )

    # ==================================================================
    # Name & Date Resolution
    # ==================================================================

    @staticmethod
    def _resolve_company_name(
        dolphin_name: Optional[str],
        text_content: str,
        file_path: str,
    ) -> str:
        """Resolve company name: Dolphin → text heuristics → filename.

        The text heuristics replicate the inline logic of PDFParser (which
        cannot be called directly), so this method needs no import from it.
        Text-derived names are truncated to 100 characters.
        """
        if dolphin_name:
            return dolphin_name

        import os

        lines = text_content.split("\n")
        ignored = {
            "TABLE OF CONTENTS", "CONTENTS", "INDEX", "FINANCIAL STATEMENTS",
            "CONSOLIDATED FINANCIAL STATEMENTS", "ANNUAL REPORT", "QUARTERLY REPORT",
            "10-K", "10-Q", "FORM 10-K", "FORM 10-Q", "UNITED STATES",
            "SECURITIES AND EXCHANGE COMMISSION", "WASHINGTON", "D.C.",
        }

        def is_boilerplate(candidate: str) -> bool:
            upper = candidate.upper()
            return any(ig in upper for ig in ignored)

        # SEC filing heuristic: the name sits on a line above the
        # "exact name of registrant" caption, so walk upwards from it.
        registrant_idx = -1
        for i, line in enumerate(lines[:100]):
            if "exact name of registrant" in line.lower():
                registrant_idx = i
                break

        if registrant_idx > 0:
            for j in range(registrant_idx - 1, -1, -1):
                candidate = lines[j].strip()
                if len(candidate) > 2 and not is_boilerplate(candidate):
                    return candidate[:100]

        # First meaningful line
        for line in lines[:40]:
            candidate = line.strip()
            if (
                len(candidate) > 2
                and not is_boilerplate(candidate)
                and not candidate.isdigit()
                and any(c.isalpha() for c in candidate)
            ):
                return candidate[:100]

        # Filename fallback
        basename = os.path.basename(file_path)
        return os.path.splitext(basename)[0].replace("-", " ").replace("_", " ")

    @staticmethod
    def _resolve_fiscal_year(
        dolphin_year: Optional[str],
        text_content: str,
    ) -> date:
        """Resolve fiscal year: Dolphin → PDFParser text patterns.

        A Dolphin year is accepted when its first 4-digit run lies between
        1990 and next calendar year, and is reported as December 31 of that
        year. Otherwise the result of ``PDFParser._extract_fiscal_year`` on
        the text is returned.
        """
        # Try Dolphin result first
        if dolphin_year:
            year_match = re.search(r"\d{4}", dolphin_year)
            if year_match:
                y = int(year_match.group(0))
                if 1990 <= y <= date.today().year + 1:
                    return date(y, 12, 31)

        # Reuse PDFParser's fiscal year extraction
        from app.services.ingestion.parser_pdf import PDFParser
        return PDFParser._extract_fiscal_year(text_content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/ingestion/parser_pdf.py
DELETED
|
@@ -1,402 +0,0 @@
|
|
| 1 |
-
import pdfplumber
|
| 2 |
-
import re
|
| 3 |
-
from typing import Dict, Any, Optional, List
|
| 4 |
-
from app.schemas.financial import (
|
| 5 |
-
FinancialReport,
|
| 6 |
-
BalanceSheetStandard,
|
| 7 |
-
IncomeStatementStandard,
|
| 8 |
-
CashFlowStandard,
|
| 9 |
-
OperatingMetrics,
|
| 10 |
-
PeriodType,
|
| 11 |
-
Currency
|
| 12 |
-
)
|
| 13 |
-
from datetime import date
|
| 14 |
-
from app.services.ingestion.mappings import DataMapper
|
| 15 |
-
|
| 16 |
-
class PDFParser:
|
| 17 |
-
@staticmethod
|
| 18 |
-
def parse(file_path: str) -> FinancialReport:
    """Parse a financial PDF with pdfplumber into a ``FinancialReport``.

    Steps:
      1. Locate the income / balance / cash-flow statement pages.
      2. Extract table data from those pages, restricted to the fields
         that belong to each statement type.
      3. Extract the text of every page.
      4. Fill still-missing fields with a regex pass over that text.
      5. Derive the fiscal year end from the text.
      6. Compute subtotals and assemble the report objects.

    Missing or ``None`` fields are treated as ``0.0``.

    Args:
        file_path: Path of the PDF to parse; also the last-resort source of
            the company name (file name without extension).
    """
    extracted_data = {}
    text_content = ""

    with pdfplumber.open(file_path) as pdf:
        # 1. Statement Locator Strategy (Find Income, Balance, Cash Flow pages)
        statement_pages = PDFParser._find_statement_pages(pdf)

        # 2. Extract Data from Tables on those pages
        for stmt_type, page in statement_pages.items():
            print(f"Processing {stmt_type} on page {page.page_number}")

            # Determine allowed fields based on statement type
            allowed_fields = None
            if stmt_type == "income":
                allowed_fields = DataMapper.INCOME_FIELDS
            elif stmt_type == "balance":
                allowed_fields = DataMapper.BALANCE_FIELDS
            elif stmt_type == "cash_flow":
                allowed_fields = DataMapper.CASH_FIELDS

            table_data = PDFParser._extract_table_data(page, allowed_fields)
            extracted_data.update(table_data)

        # 3. Global Text Extraction (for Regex Fallback & Metrics)
        for page in pdf.pages:
            # Guard against a page yielding no text object at all (None),
            # which would make the concatenation raise TypeError.
            # NOTE(review): presumably image-only pages on some pdfplumber
            # versions; confirm.
            text_content += (page.extract_text() or "") + "\n"

    # 4. Fallback / Regex Strategy for missing fields
    regex_data = PDFParser._extract_via_regex(text_content, existing_keys=extracted_data.keys())
    extracted_data.update(regex_data)

    # 5. Extract Fiscal Year
    fiscal_year_date = PDFParser._extract_fiscal_year(text_content)

    # 6. Construct Financial Objects
    def get(key, default=0.0):
        val = extracted_data.get(key)
        if val is None:
            return default
        return val

    # Calculate Computed Fields
    revenue = get("revenue")
    cogs = get("cogs")
    marketing = get("marketing_expenses")
    payroll = get("payroll_expenses")
    rent = get("rent_expense")
    other = get("other_operating_expenses")
    depreciation = get("depreciation")
    amortization = get("amortization")
    interest = get("interest_expense")
    taxes = get("taxes")

    op_expenses = marketing + payroll + rent + other
    gross_profit = revenue - cogs
    ebitda = gross_profit - op_expenses
    op_income = ebitda - depreciation - amortization
    net_income = op_income - interest - taxes

    income = IncomeStatementStandard(
        revenue=revenue,
        cogs=cogs,
        marketing_expenses=marketing,
        payroll_expenses=payroll,
        rent_expense=rent,
        other_operating_expenses=other,
        depreciation=depreciation,
        amortization=amortization,
        interest_expense=interest,
        taxes=taxes,
        # Computed
        operating_expenses=op_expenses,
        gross_profit=gross_profit,
        ebitda=ebitda,
        operating_income=op_income,
        net_income=net_income
    )

    # Balance Sheet Computed
    cash = get("cash")
    ar = get("accounts_receivable")
    inv = get("inventory")
    prepaid = get("prepaid_expenses")
    ppe = get("property_plant_equipment")
    accum_dep = get("accumulated_depreciation")
    intangibles = get("intangible_assets")

    ap = get("accounts_payable")
    accrued = get("accrued_liabilities")
    st_debt = get("short_term_debt")
    lt_debt = get("long_term_debt")
    deferred = get("deferred_revenue")
    equity = get("total_equity")

    bs_current_assets = cash + ar + inv + prepaid
    bs_total_assets = bs_current_assets + (ppe - accum_dep) + intangibles
    bs_current_liab = ap + accrued + st_debt
    bs_total_liab = bs_current_liab + lt_debt + deferred

    balance = BalanceSheetStandard(
        cash=cash,
        accounts_receivable=ar,
        inventory=inv,
        prepaid_expenses=prepaid,
        property_plant_equipment=ppe,
        accumulated_depreciation=accum_dep,
        intangible_assets=intangibles,
        accounts_payable=ap,
        accrued_liabilities=accrued,
        short_term_debt=st_debt,
        long_term_debt=lt_debt,
        deferred_revenue=deferred,
        total_equity=equity,
        # Computed
        total_current_assets=bs_current_assets,
        total_assets=bs_total_assets,
        total_current_liabilities=bs_current_liab,
        total_liabilities=bs_total_liab
    )

    cash_flow = CashFlowStandard(
        operating_cash_flow=get("operating_cash_flow"),
        capex=get("capex"),
        investing_cash_flow=get("investing_cash_flow"),
        financing_cash_flow=get("financing_cash_flow"),
        net_change_in_cash=get("net_change_in_cash")
    )

    metrics = OperatingMetrics(
        # A non-zero restaurant margin is the only industry signal used.
        industry='restaurant' if get("restaurant_margin") else 'general',
        # Absent or zero metrics are reported as None.
        new_customers=int(get("new_customers")) if get("new_customers") else None,
        total_transactions=int(get("total_transactions")) if get("total_transactions") else None,
        total_seats=int(get("total_seats")) if get("total_seats") else None,
        churn_rate=get("churn_rate") if get("churn_rate") else None,
        cac=get("cac") if get("cac") else None,
        ltv=get("ltv") if get("ltv") else None,
    )

    metadata = {
        "extracted_restaurant_margin": str(get("restaurant_margin")),
        "extracted_effective_tax_rate": str(get("effective_tax_rate"))
    }

    # Company Name Heuristic
    company_name = "Detected via OCR"
    name_found = False

    # 1. SEC Filing Heuristic: the name sits on a line above the
    #    "Exact name of registrant" caption, so walk upwards from it.
    registrant_marker = "Exact name of registrant"
    registrant_index = -1

    extracted_lines = text_content.split('\n')

    for i, line in enumerate(extracted_lines[:100]):
        if registrant_marker.lower() in line.lower():
            registrant_index = i
            break

    if registrant_index > 0:
        for j in range(registrant_index - 1, -1, -1):
            candidate = extracted_lines[j].strip()
            if len(candidate) > 2:
                if "FORM" not in candidate.upper() and "UNITED STATES" not in candidate.upper():
                    company_name = candidate
                    name_found = True
                    break

    # 2. Top-of-page Heuristic: first line that is not filing boilerplate.
    if not name_found:
        ignored_names = [
            "TABLE OF CONTENTS", "CONTENTS", "INDEX", "FINANCIAL STATEMENTS",
            "CONSOLIDATED FINANCIAL STATEMENTS", "ANNUAL REPORT", "QUARTERLY REPORT",
            "10-K", "10-Q", "FORM 10-K", "FORM 10-Q", "UNITED STATES",
            "SECURITIES AND EXCHANGE COMMISSION", "WASHINGTON", "D.C.",
            "COMMISSION FILE NUMBER", "TRANSITION REPORT", "QUARTERLY REPORT PURSUANT"
        ]

        for line in extracted_lines[:40]:
            candidate = line.strip()
            if (len(candidate) > 2
                    and not any(ignore in candidate.upper() for ignore in ignored_names)
                    and not candidate.isdigit()
                    and "FILE NUMBER" not in candidate.upper()):

                if any(c.isalpha() for c in candidate):
                    company_name = candidate[:100]
                    name_found = True
                    break

    # 3. Filename Fallback
    if not name_found or company_name == "Detected via OCR":
        import os
        basename = os.path.basename(file_path)
        company_name = os.path.splitext(basename)[0].replace("-", " ").replace("_", " ")

    return PDFParser._finalize_report(company_name, income, balance, cash_flow, metrics, metadata, fiscal_year_date)
|
| 215 |
-
|
| 216 |
-
@staticmethod
|
| 217 |
-
def _finalize_report(name, income, balance, cash, metrics, meta, period_end):
    """Assemble the final ``FinancialReport`` from the extracted pieces.

    The period type is always ``PeriodType.ANNUAL`` and the currency is
    always ``Currency.USD``; every other field is passed through from the
    caller unchanged.
    """
    report_fields = {
        "company_name": name,
        "period_end": period_end,
        "period_type": PeriodType.ANNUAL,
        "currency": Currency.USD,
        "income_statement": income,
        "balance_sheet": balance,
        "cash_flow": cash,
        "metrics": metrics,
        "metadata": meta,
    }
    return FinancialReport(**report_fields)
|
| 230 |
-
|
| 231 |
-
@staticmethod
|
| 232 |
-
def _extract_fiscal_year(text: str) -> date:
|
| 233 |
-
"""Finds the fiscal year end date from the text."""
|
| 234 |
-
# Pattern 1: Year Ended December 31, 2024
|
| 235 |
-
# Pattern 2: Period Ended ...
|
| 236 |
-
patterns = [
|
| 237 |
-
r"(?:YEAR|PERIOD|FISCAL YEAR)\s+ENDED\s+([A-Z]+\s+\d{1,2},\s+\d{4})",
|
| 238 |
-
r"DECEMBER\s+31,\s+(\d{4})"
|
| 239 |
-
]
|
| 240 |
-
|
| 241 |
-
current_year = date.today().year
|
| 242 |
-
found_years = []
|
| 243 |
-
|
| 244 |
-
for pat in patterns:
|
| 245 |
-
matches = re.findall(pat, text[:5000], re.IGNORECASE) # Search first 5000 chars
|
| 246 |
-
for m in matches:
|
| 247 |
-
if isinstance(m, tuple): m = m[0]
|
| 248 |
-
# Extract year digit
|
| 249 |
-
year_match = re.search(r"\d{4}", m)
|
| 250 |
-
if year_match:
|
| 251 |
-
y = int(year_match.group(0))
|
| 252 |
-
if 1990 <= y <= current_year + 1:
|
| 253 |
-
found_years.append(y)
|
| 254 |
-
|
| 255 |
-
if found_years:
|
| 256 |
-
# Most frequent or max year? Usually max year in the header is the current report year.
|
| 257 |
-
best_year = max(found_years)
|
| 258 |
-
return date(best_year, 12, 31) # Default to Dec 31
|
| 259 |
-
|
| 260 |
-
return date.today()
|
| 261 |
-
|
| 262 |
-
@staticmethod
|
| 263 |
-
def _find_statement_pages(pdf) -> Dict[str, Any]:
|
| 264 |
-
""" Identifies pages containing specific financial statements. """
|
| 265 |
-
pages = {}
|
| 266 |
-
for page in pdf.pages:
|
| 267 |
-
text = (page.extract_text() or "").upper()
|
| 268 |
-
|
| 269 |
-
# Skip Table of Contents pages (unless they contain financial data like '$')
|
| 270 |
-
if ("TABLE OF CONTENTS" in text[:500] or "INDEX" in text[:200]) and "$" not in text[:2000]:
|
| 271 |
-
continue
|
| 272 |
-
|
| 273 |
-
# Expanded Keywords
|
| 274 |
-
# Income
|
| 275 |
-
if any(x in text for x in ["CONSOLIDATED STATEMENTS OF OPERATIONS", "CONSOLIDATED STATEMENTS OF INCOME", "CONSOLIDATED STATEMENTS OF EARNINGS", "DISSOLIDATED STATEMENTS OF LOSS", "STATEMENT OF INCOME", "STATEMENTS OF OPERATIONS"]):
|
| 276 |
-
if "income" not in pages: pages["income"] = page
|
| 277 |
-
|
| 278 |
-
# Balance
|
| 279 |
-
elif any(x in text for x in ["CONSOLIDATED BALANCE SHEETS", "CONSOLIDATED STATEMENTS OF FINANCIAL POSITION", "BALANCE SHEETS", "FINANCIAL POSITION"]):
|
| 280 |
-
if "balance" not in pages: pages["balance"] = page
|
| 281 |
-
|
| 282 |
-
# Cash Flow
|
| 283 |
-
elif any(x in text for x in ["CONSOLIDATED STATEMENTS OF CASH FLOWS", "CONSOLIDATED STATEMENT OF CASH FLOWS", "STATEMENTS OF CASH FLOWS", "CASH FLOWS"]):
|
| 284 |
-
if "cash_flow" not in pages: pages["cash_flow"] = page
|
| 285 |
-
|
| 286 |
-
return pages
|
| 287 |
-
|
| 288 |
-
@staticmethod
|
| 289 |
-
def _extract_table_data(page, allowed_fields: Optional[List[str]] = None) -> Dict[str, float]:
    """Extract ``field -> value`` pairs from the tables on *page*.

    For every table:

    1. The first five rows are scanned for cells that are exactly a
       four-digit year between 2001 and 2099; the column holding the
       highest year becomes the trusted value column.
    2. Each row's first cell is mapped to a standard field name with
       ``DataMapper.map_row``. Rows that do not map, or that map to a field
       outside *allowed_fields* (when given), are ignored.
    3. The value is read from the trusted column when one was found and the
       row is long enough; otherwise the first cell after the label that
       ``PDFParser._clean_value`` can parse is used.

    Values are multiplied by a scale detected from the first 1000
    characters of the page text ("in millions" -> 1,000,000,
    "in thousands" -> 1,000, otherwise 1). The scale depends only on the
    page, so it is detected (and printed) once rather than once per table.

    Args:
        page: Page object providing ``extract_tables()`` and
            ``extract_text()``.
        allowed_fields: Optional whitelist of field names to keep.

    Returns:
        Mapping of field name to scaled value; later tables overwrite
        earlier ones for the same field.
    """
    data: Dict[str, float] = {}
    tables = page.extract_tables()
    if not tables:
        return data

    # Header-based scaling detection: "(in thousands)", "(in millions)", etc.
    header_text = (page.extract_text() or "")[:1000].lower()
    if re.search(r"\(in millions\)|in millions, except|dollares en millones|amounts in millions|dollars in millions", header_text):
        multiplier = 1000000.0
    elif re.search(r"\(in thousands\)|in thousands, except|dollares en miles|amounts in thousands|dollars in thousands|\(in 000s\)", header_text):
        multiplier = 1000.0
    else:
        multiplier = 1.0
    print(f"Detected scale multiplier: {multiplier}")

    for table in tables:
        # 1. Identify the "current year" column from the header rows.
        target_col_idx = -1
        max_year = 0
        for header_row in table[:5]:
            if not header_row:
                continue
            for idx, cell in enumerate(header_row):
                if not cell:
                    continue
                cleaned = cell.replace("$", "").strip()
                if not re.match(r"^\d{4}$", cleaned):
                    continue
                year = int(cleaned)
                if 2000 < year < 2100 and year > max_year:
                    max_year = year
                    target_col_idx = idx

        # 2. Map labelled rows to fields and pull their values.
        for row in table:
            if not row or not row[0]:
                continue

            mapped_field = DataMapper.map_row(row[0])
            if not mapped_field:
                continue
            if allowed_fields is not None and mapped_field not in allowed_fields:
                continue

            val = None
            if target_col_idx != -1 and target_col_idx < len(row):
                # Trusted column: the one under the most recent year header.
                val = PDFParser._clean_value(row[target_col_idx])
            else:
                # Fallback: first parseable numeric cell after the label.
                for col_val in row[1:]:
                    clean_val = PDFParser._clean_value(col_val)
                    if clean_val is not None:
                        val = clean_val
                        break

            if val is not None:
                data[mapped_field] = val * multiplier
    return data
|
| 360 |
-
|
| 361 |
-
@staticmethod
|
| 362 |
-
def _clean_value(val_str: Optional[str]) -> Optional[float]:
|
| 363 |
-
""" Converts financial string formats to float. Handles parentheses for negative. """
|
| 364 |
-
if not val_str:
|
| 365 |
-
return None
|
| 366 |
-
|
| 367 |
-
s = val_str.strip().replace("$", "").replace(",", "").replace(" ", "")
|
| 368 |
-
if not s:
|
| 369 |
-
return None
|
| 370 |
-
|
| 371 |
-
# Handle (123) as negative
|
| 372 |
-
if "(" in s and ")" in s:
|
| 373 |
-
s = s.replace("(", "-").replace(")", "")
|
| 374 |
-
|
| 375 |
-
# Handle - as 0 (accounting format sometimes uses - for 0)
|
| 376 |
-
if s == "-" or s == "—":
|
| 377 |
-
return 0.0
|
| 378 |
-
|
| 379 |
-
try:
|
| 380 |
-
return float(s)
|
| 381 |
-
except ValueError:
|
| 382 |
-
return None
|
| 383 |
-
|
| 384 |
-
@staticmethod
|
| 385 |
-
def _extract_via_regex(text_content: str, existing_keys: List[str]) -> Dict[str, float]:
    """Fallback extraction for fields that were not found in tables.

    For every field in ``DataMapper.FIELD_MAPPING`` that is not already in
    *existing_keys*, each alias is searched for in *text_content*
    (case-insensitively) followed by the nearest number, e.g.
    ``"Keyword $1,234.56"`` or ``"Keyword....... 1,234.56"``. The first
    alias whose number parses with ``PDFParser._clean_value`` wins.

    Args:
        text_content: Full text to search.
        existing_keys: Field names that were already extracted and must
            not be overwritten; ``None`` is treated as empty.

    Returns:
        Mapping of newly found field names to their values. No scale
        multiplier is applied here.
    """
    data: Dict[str, float] = {}
    already_found = set(existing_keys or ())

    for field, aliases in DataMapper.FIELD_MAPPING.items():
        if field in already_found:
            continue

        for alias in aliases:
            # NOTE(review): the alias is interpolated as a regex, as in the
            # original; if aliases are meant to be literal text they should
            # go through re.escape - confirm against DataMapper.
            try:
                pattern = re.compile(rf"{alias}[^0-9-]*?(\(?[\d,]+\.?\d*\)?)", re.IGNORECASE)
            except re.error:
                # An alias with unbalanced metacharacters must not abort the
                # whole extraction; just try the next alias.
                continue

            match = pattern.search(text_content)
            if not match:
                continue

            val = PDFParser._clean_value(match.group(1))
            if val is not None:
                data[field] = val
                break
    return data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/ingestion/parser_xlsx.py
DELETED
|
@@ -1,312 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
XLSX Parser - Excel file parsing for financial data.
|
| 3 |
-
|
| 4 |
-
Parses Excel workbooks containing financial statements, handling:
|
| 5 |
-
- Multi-sheet detection (Income Statement, Balance Sheet, Cash Flow)
|
| 6 |
-
- Single-sheet condensed format
|
| 7 |
-
- Various column/row layouts
|
| 8 |
-
"""
|
| 9 |
-
|
| 10 |
-
import re
|
| 11 |
-
from typing import Dict, Any, Optional, List
|
| 12 |
-
from datetime import date
|
| 13 |
-
|
| 14 |
-
try:
|
| 15 |
-
import openpyxl
|
| 16 |
-
from openpyxl import load_workbook
|
| 17 |
-
from openpyxl.worksheet.worksheet import Worksheet
|
| 18 |
-
except ImportError:
|
| 19 |
-
openpyxl = None
|
| 20 |
-
|
| 21 |
-
import pandas as pd
|
| 22 |
-
|
| 23 |
-
from app.schemas.financial import (
|
| 24 |
-
FinancialReport,
|
| 25 |
-
BalanceSheetStandard,
|
| 26 |
-
IncomeStatementStandard,
|
| 27 |
-
CashFlowStandard,
|
| 28 |
-
OperatingMetrics,
|
| 29 |
-
PeriodType,
|
| 30 |
-
Currency
|
| 31 |
-
)
|
| 32 |
-
from app.services.ingestion.mappings import DataMapper
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
class XLSXParser:
    """Parser for Excel (.xlsx, .xls) financial files.

    ``parse`` is the entry point. It reads the workbook with openpyxl when
    that package is available and falls back to pandas when openpyxl is
    missing, fails, or yields no recognised rows.
    """

    # Keywords to identify sheet types (matched against lower-cased sheet names)
    INCOME_KEYWORDS = ['income', 'p&l', 'profit', 'loss', 'revenue', 'earnings']
    BALANCE_KEYWORDS = ['balance', 'assets', 'liabilities', 'position']
    CASHFLOW_KEYWORDS = ['cash flow', 'cashflow', 'cash', 'liquidity']

    @staticmethod
    def parse(file_path: str) -> 'FinancialReport':
        """Parse an Excel file and return a standardized FinancialReport.

        Handles both multi-sheet workbooks (one sheet per statement, found
        by sheet name) and single-sheet workbooks (the active sheet is used
        for everything).

        Args:
            file_path: Path of the workbook on disk.

        Returns:
            The report built from whatever rows could be mapped. This
            method does not raise on unreadable files; it degrades to the
            pandas parser, which in turn returns an empty report.
        """
        if openpyxl is None:
            # Fallback to pandas-only parsing
            return XLSXParser._parse_with_pandas(file_path)

        try:
            wb = load_workbook(file_path, data_only=True)

            # Categorize sheets by name; a later matching sheet replaces an
            # earlier one of the same category.
            income_sheet = None
            balance_sheet = None
            cashflow_sheet = None

            for sheet_name in wb.sheetnames:
                name_lower = sheet_name.lower()

                if any(kw in name_lower for kw in XLSXParser.INCOME_KEYWORDS):
                    income_sheet = wb[sheet_name]
                elif any(kw in name_lower for kw in XLSXParser.BALANCE_KEYWORDS):
                    balance_sheet = wb[sheet_name]
                elif any(kw in name_lower for kw in XLSXParser.CASHFLOW_KEYWORDS):
                    cashflow_sheet = wb[sheet_name]

            # If no specialized sheets found, use the active sheet for all
            if income_sheet is None and balance_sheet is None and cashflow_sheet is None:
                income_sheet = balance_sheet = cashflow_sheet = wb.active

            # Extract data from each distinct sheet exactly once
            data_dict = {}
            processed = []
            for sheet in (income_sheet, balance_sheet, cashflow_sheet):
                if sheet is None or any(sheet is done for done in processed):
                    continue
                processed.append(sheet)
                data_dict.update(XLSXParser._extract_from_sheet(sheet))

            # If still no data, try pandas fallback
            if not data_dict:
                return XLSXParser._parse_with_pandas(file_path)

            # Extract company name from the first cells of the active sheet
            company_name = XLSXParser._extract_company_name(wb)

            return XLSXParser._build_report(data_dict, company_name)

        except Exception as e:
            # Deliberate best-effort: any openpyxl failure falls back to pandas
            print(f"openpyxl parse failed, falling back to pandas: {e}")
            return XLSXParser._parse_with_pandas(file_path)

    @staticmethod
    def _extract_from_sheet(sheet: 'Worksheet') -> Dict[str, float]:
        """Extract financial data from the first 200 rows of a worksheet.

        A row contributes when its first cell maps to a field through
        ``DataMapper.map_row``; the value is the first later cell in that
        row that ``_clean_value`` can convert.
        """
        data = {}

        for row in sheet.iter_rows(min_row=1, max_row=min(200, sheet.max_row)):
            if not row or not row[0].value:
                continue

            label = str(row[0].value).strip()
            field = DataMapper.map_row(label)
            if not field:
                continue

            # Find the first cell in this row that holds a usable number
            for cell in row[1:]:
                val = XLSXParser._clean_value(cell.value)
                if val is not None:
                    data[field] = val
                    break

        return data

    @staticmethod
    def _clean_value(val: Any) -> Optional[float]:
        """Clean and convert a cell value to float.

        Strings may contain ``$`` and thousands separators, and use
        ``(1000)`` for negatives. Numeric scalars that are not plain
        ``int``/``float`` (for example numpy integers coming from pandas)
        are converted with ``float()``.

        Returns:
            The number, or ``None`` for empty, boolean, or non-numeric
            values.
        """
        if val is None or isinstance(val, bool):
            # A TRUE/FALSE cell is not a financial amount.
            return None
        if isinstance(val, (int, float)):
            return float(val)
        if isinstance(val, str):
            # Remove currency symbols and commas
            cleaned = re.sub(r'[,$]', '', val.strip())
            # Handle (1000) format for negatives
            if cleaned.startswith('(') and cleaned.endswith(')'):
                cleaned = '-' + cleaned[1:-1]
            try:
                return float(cleaned)
            except ValueError:
                return None
        try:
            # Other numeric scalar types (numpy integers, Decimal, ...)
            return float(val)
        except (TypeError, ValueError):
            # Dates, formulas objects and anything else non-numeric
            return None

    @staticmethod
    def _extract_company_name(wb) -> str:
        """Try to extract the company name from the workbook.

        Looks at the first five rows and three columns of the active sheet
        and returns the first string of 4-99 characters that does not
        contain a common statement/header keyword.

        Returns:
            The detected name, or ``"Imported Company"`` when none is found.
        """
        header_words = ['balance', 'income', 'cash', 'statement', 'period', 'date', 'quarter', 'annual']

        sheet = wb.active
        if sheet is None:
            return "Imported Company"

        for row in sheet.iter_rows(min_row=1, max_row=5, max_col=3):
            for cell in row:
                if not cell.value or not isinstance(cell.value, str):
                    continue
                val = cell.value.strip()
                if not 3 < len(val) < 100:
                    continue
                # Skip common headers
                lower = val.lower()
                if not any(kw in lower for kw in header_words):
                    return val
        return "Imported Company"

    @staticmethod
    def _parse_with_pandas(file_path: str) -> 'FinancialReport':
        """Fallback parsing using pandas.

        Every sheet with at least two columns is read in vertical layout
        (label in the first column, values in the following ones). Values
        go through ``_clean_value`` so the same formats are accepted as on
        the openpyxl path.

        Returns:
            The built report, or an empty report when reading fails.
        """
        try:
            data_dict = {}

            # The context manager releases the file handle even on errors
            with pd.ExcelFile(file_path) as xl:
                for sheet_name in xl.sheet_names:
                    df = pd.read_excel(xl, sheet_name=sheet_name)

                    if df.empty or len(df.columns) < 2:
                        continue

                    for _, row in df.iterrows():
                        first = row.iloc[0]
                        label = str(first) if pd.notna(first) else ""
                        field = DataMapper.map_row(label)
                        if not field:
                            continue

                        # Find first numeric value
                        for val in row.iloc[1:]:
                            if pd.isna(val):
                                continue
                            number = XLSXParser._clean_value(val)
                            if number is not None:
                                data_dict[field] = number
                                break

            return XLSXParser._build_report(data_dict, "Imported Company")

        except Exception as e:
            # Last line of defence: report the problem and return an empty report
            print(f"Pandas XLSX parse failed: {e}")
            return XLSXParser._build_empty_report()

    @staticmethod
    def _build_report(data_dict: Dict[str, float], company_name: str) -> 'FinancialReport':
        """Build a FinancialReport from extracted data.

        Missing fields default to ``0.0``. Subtotals (gross profit, EBITDA,
        operating income, net income, asset and liability totals) are
        computed from the line items rather than read from the file. The
        period end is set to today's date, the period type to annual and
        the currency to USD.
        """
        def get(key: str, default: float = 0.0) -> float:
            return data_dict.get(key, default)

        def optional_count(key: str) -> Optional[int]:
            # Absent or zero counts are reported as None
            value = get(key)
            return int(value) if value else None

        # Computed Income
        revenue = get("revenue")
        cogs = get("cogs")
        marketing = get("marketing_expenses")
        payroll = get("payroll_expenses")
        rent = get("rent_expense")
        other = get("other_operating_expenses")
        depreciation = get("depreciation")
        amortization = get("amortization")
        interest = get("interest_expense")
        taxes = get("taxes")

        op_expenses = marketing + payroll + rent + other
        gross_profit = revenue - cogs
        ebitda = gross_profit - op_expenses
        op_income = ebitda - depreciation - amortization
        net_income = op_income - interest - taxes

        income = IncomeStatementStandard(
            revenue=revenue,
            cogs=cogs,
            marketing_expenses=marketing,
            payroll_expenses=payroll,
            rent_expense=rent,
            other_operating_expenses=other,
            depreciation=depreciation,
            amortization=amortization,
            interest_expense=interest,
            taxes=taxes,
            # Computed
            operating_expenses=op_expenses,
            gross_profit=gross_profit,
            ebitda=ebitda,
            operating_income=op_income,
            net_income=net_income
        )

        # Computed Balance
        cash = get("cash")
        ar = get("accounts_receivable")
        inv = get("inventory")
        prepaid = get("prepaid_expenses")
        ppe = get("property_plant_equipment")
        accum_dep = get("accumulated_depreciation")
        intangibles = get("intangible_assets")

        ap = get("accounts_payable")
        accrued = get("accrued_liabilities")
        st_debt = get("short_term_debt")
        lt_debt = get("long_term_debt")
        deferred = get("deferred_revenue")
        equity = get("total_equity")

        bs_current_assets = cash + ar + inv + prepaid
        bs_total_assets = bs_current_assets + (ppe - accum_dep) + intangibles
        bs_current_liab = ap + accrued + st_debt
        bs_total_liab = bs_current_liab + lt_debt + deferred

        balance = BalanceSheetStandard(
            cash=cash,
            accounts_receivable=ar,
            inventory=inv,
            prepaid_expenses=prepaid,
            property_plant_equipment=ppe,
            accumulated_depreciation=accum_dep,
            intangible_assets=intangibles,
            accounts_payable=ap,
            accrued_liabilities=accrued,
            short_term_debt=st_debt,
            long_term_debt=lt_debt,
            deferred_revenue=deferred,
            total_equity=equity,
            # Computed
            total_current_assets=bs_current_assets,
            total_assets=bs_total_assets,
            total_current_liabilities=bs_current_liab,
            total_liabilities=bs_total_liab
        )

        cash_flow = CashFlowStandard(
            operating_cash_flow=get("operating_cash_flow"),
            capex=get("capex"),
            investing_cash_flow=get("investing_cash_flow"),
            financing_cash_flow=get("financing_cash_flow")
        )

        metrics = OperatingMetrics(
            industry='general',
            new_customers=optional_count("new_customers"),
            total_transactions=optional_count("total_transactions"),
            total_seats=optional_count("total_seats")
        )

        return FinancialReport(
            company_name=company_name,
            period_end=date.today(),
            period_type=PeriodType.ANNUAL,
            currency=Currency.USD,
            income_statement=income,
            balance_sheet=balance,
            cash_flow=cash_flow,
            metrics=metrics
        )

    @staticmethod
    def _build_empty_report() -> 'FinancialReport':
        """Build an all-zero report for "Unknown Company" as a last resort."""
        return XLSXParser._build_report({}, "Unknown Company")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/ingestion/unified_parser.py
DELETED
|
@@ -1,84 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Unified Parser - Central coordinator for all file format parsing.
|
| 3 |
-
|
| 4 |
-
This module provides a single entry point for parsing any supported
|
| 5 |
-
financial document format (CSV, PDF, XLSX).
|
| 6 |
-
"""
|
| 7 |
-
|
| 8 |
-
from typing import Tuple
|
| 9 |
-
from app.schemas.financial import FinancialReport
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
class UnifiedParser:
    """
    Central parser that routes files to appropriate format-specific parsers.

    Supported formats:
    - CSV: Comma-separated values
    - PDF: PDF documents (10-K, 10-Q, financial reports)
    - XLSX/XLS: Excel workbooks
    """

    # Maps a lower-case file extension (without the dot) to a parser format.
    SUPPORTED_EXTENSIONS = {
        'csv': 'csv',
        'pdf': 'pdf',
        'xlsx': 'xlsx',
        'xls': 'xlsx',  # Route both to XLSX parser
    }

    @staticmethod
    def _extension(filename: str) -> str:
        """Return the lower-cased extension of *filename* without the dot.

        Surrounding whitespace and directory components are ignored.
        Returns ``''`` when *filename* is empty/``None`` or has no
        extension.
        """
        name = (filename or "").strip()
        # Only the final path component may contribute the extension, so a
        # dot in a directory name is not mistaken for one.
        name = name.replace("\\", "/").rsplit("/", 1)[-1]
        if '.' not in name:
            return ''
        return name.rsplit('.', 1)[-1].lower()

    @staticmethod
    def get_format(filename: str) -> str:
        """
        Determine file format from filename.

        Returns: 'csv', 'pdf' or 'xlsx'.

        Raises:
            ValueError: If the extension is not in SUPPORTED_EXTENSIONS.
        """
        ext = UnifiedParser._extension(filename)

        if ext not in UnifiedParser.SUPPORTED_EXTENSIONS:
            # Derive the list from the mapping so the message cannot drift
            # out of date when a format is added.
            supported = ", ".join(f".{known}" for known in UnifiedParser.SUPPORTED_EXTENSIONS)
            raise ValueError(f"Unsupported file format: .{ext}. Supported: {supported}")

        return UnifiedParser.SUPPORTED_EXTENSIONS[ext]

    @staticmethod
    def parse(file_path: str, filename: str) -> "FinancialReport":
        """
        Parse a financial document and return standardized FinancialReport.

        Args:
            file_path: Path to the saved file on disk
            filename: Original filename (used for format detection)

        Returns:
            FinancialReport with standardized financial data

        Raises:
            ValueError: If file format is not supported
        """
        fmt = UnifiedParser.get_format(filename)

        # Parsers are imported lazily so only the one that is needed is loaded.
        if fmt == 'csv':
            from app.services.ingestion.parser_csv import CSVParser
            return CSVParser.parse(file_path)

        if fmt == 'pdf':
            from app.services.ingestion.parser_dolphin import HybridPDFParser
            return HybridPDFParser.parse(file_path)

        if fmt == 'xlsx':
            from app.services.ingestion.parser_xlsx import XLSXParser
            return XLSXParser.parse(file_path)

        raise ValueError(f"No parser available for format: {fmt}")

    @staticmethod
    def is_supported(filename: str) -> bool:
        """Check if a filename has a supported extension."""
        return UnifiedParser._extension(filename) in UnifiedParser.SUPPORTED_EXTENSIONS

    @staticmethod
    def get_supported_extensions() -> list:
        """Return list of supported file extensions (without the dot)."""
        return list(UnifiedParser.SUPPORTED_EXTENSIONS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/intelligence/ai_cfo.py
DELETED
|
@@ -1,52 +0,0 @@
|
|
| 1 |
-
from app.schemas.financial import StandardizedDataPackage
|
| 2 |
-
import os
|
| 3 |
-
|
| 4 |
-
class AICFOService:
    """Produces a CFO-style narrative summary for an analysed company."""

    @staticmethod
    def _build_prompt(data: "StandardizedDataPackage") -> str:
        """Build the LLM prompt describing the company's financial snapshot.

        Reads the company name, revenue, net margin and risk score from
        *data*, plus every insight string containing ``"Pain"``. A missing
        (``None``) insights list is treated as empty.
        """
        company = data.raw_data.company_name
        revenue = data.raw_data.income_statement.revenue
        margin = data.kpis.net_margin
        score = data.risk_analysis.risk_score
        pain_points = ', '.join(p for p in (data.insights or []) if 'Pain' in p)

        return "\n".join([
            f"You are an elite CFO advising the CEO of {company}.",
            "Financial Snapshot:",
            f"- Annual Revenue: ${revenue:,.2f}",
            f"- Net Margin: {margin:.1f}%",
            f"- Overall Risk Score: {score}/100",
            f"- Top Pain Points: {pain_points}",
            "",
            "Write a 3-paragraph executive summary:",
            "1. The Good: What is working well?",
            "2. The Bad: What are the immediate risks?",
            "3. The Ugly: What needs drastic change immediately?",
            "",
            "Keep it punchy, professional, and actionable.",
        ])

    @staticmethod
    def generate_executive_summary(data: "StandardizedDataPackage") -> str:
        """
        Generates a natural language executive summary.

        No generative model is called yet: the method always returns a
        templated (mock) Markdown summary filled in with the company name,
        revenue, net margin and health score from *data*.
        """
        company = data.raw_data.company_name
        revenue = data.raw_data.income_statement.revenue
        margin = data.kpis.net_margin

        # TODO: when GEMINI_API_KEY is configured, send
        # AICFOService._build_prompt(data) to the model and return its
        # answer instead of the mock below. The prompt is not built here so
        # that the mock path cannot fail on fields it does not use.

        # Mock Response (Fallback)
        return (
            f"## Executive Summary for {company}\n\n"
            "**The Good:**\n"
            f"Your revenue is strong at ${revenue:,.0f}, demonstrating clear market demand. "
            f"A net margin of {margin:.1f}% is respectable, indicating your core unit economics are sound. "
            f"With a Health Score of {data.health_score.total_score}/100, the business foundation is stable.\n\n"
            "**The Bad:**\n"
            "We detected some potential liquidity friction locally. Your burn rate suggests you might have constrained runway if sales dip. "
            "Optimization of COGS could yield an immediate 2-3% bottom-line improvement.\n\n"
            "**The Ugly:**\n"
            "No catastrophic risks detected immediately, but reliance on a single revenue stream could be a blind spot. "
            "I recommend diversifying customer acquisition channels immediately to safeguard against volatility."
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/intelligence/gemini_service.py
DELETED
|
@@ -1,238 +0,0 @@
|
|
| 1 |
-
|
| 2 |
-
import os
|
| 3 |
-
import requests
|
| 4 |
-
import json
|
| 5 |
-
from dotenv import load_dotenv
|
| 6 |
-
from app.schemas.chat import ChatRequest, ChatResponse
|
| 7 |
-
from app.schemas.financial import StandardizedDataPackage
|
| 8 |
-
|
| 9 |
-
# Load .env file
|
| 10 |
-
load_dotenv()
|
| 11 |
-
|
| 12 |
-
class GeminiService:
    """Client for the Gemini ``generateContent`` REST endpoint with model fallback.

    Requests are attempted against each entry of ``MODELS`` in order. A model
    that answers HTTP 429 is recorded in ``_exhausted_models`` and skipped on
    later calls until ``_reset_exhausted_models`` is invoked.
    """

    # Read once, when the class body is executed.
    API_KEY = os.getenv("GEMINI_API_KEY")

    # Model fallback chain - try in order, fall back if quota exceeded
    MODELS = [
        "gemini-3-flash",          # Primary - fastest, newest
        "gemini-2.5-flash",        # Fallback 1 - stable
        "gemini-2.5-flash-lite",   # Fallback 2 - lightweight
        "gemini-2.0-flash",        # Fallback 3 - legacy stable
    ]

    # Track which models have hit quota in this session
    _exhausted_models = set()

    # Message reported when no model is left to try.
    _CAPACITY_MESSAGE = "All AI models are currently at capacity."

    @classmethod
    def _get_api_url(cls, model_name: str) -> str:
        """Generate API URL for a specific model.

        NOTE(review): the API key is embedded in the query string, so the
        returned URL should never be written to logs.
        """
        return f"https://generativelanguage.googleapis.com/v1beta/models/{model_name}:generateContent?key={cls.API_KEY}"

    @classmethod
    def _reset_exhausted_models(cls):
        """Reset exhausted models (call periodically or on new day)."""
        cls._exhausted_models.clear()

    @staticmethod
    def _parse_error_response(status_code: int, response_text: str) -> str:
        """
        Map an HTTP status code to a clean, user-friendly message.

        ``response_text`` is accepted for interface compatibility but is not
        inspected, so raw JSON is never exposed to users.
        """
        auth_failed = "AI service authentication failed. Please check your API key configuration."
        known = {
            429: "AI service is temporarily busy. Please try again in a few moments.",
            401: auth_failed,
            403: auth_failed,
            400: "Invalid request to AI service. Please try a simpler query.",
            500: "AI service is experiencing issues. Please try again later.",
            503: "AI service is temporarily unavailable. Please try again later.",
        }
        if status_code in known:
            return known[status_code]
        return f"AI service returned an unexpected error (Code: {status_code}). Please try again."

    @classmethod
    def _try_request(cls, payload: dict, timeout: int = 30) -> tuple[bool, str, str]:
        """
        Try to make a request using available models with automatic fallback.

        Returns: (success: bool, response_text: str, model_used: str)

        On failure ``response_text`` is always a non-empty, user-presentable
        message and ``model_used`` is the empty string.
        """
        if not cls.API_KEY:
            return False, "Gemini API Key is missing. Please configure GEMINI_API_KEY.", ""

        headers = {"Content-Type": "application/json"}
        last_error = ""

        for model in cls.MODELS:
            # Skip models that have hit their quota this session
            if model in cls._exhausted_models:
                continue

            try:
                api_url = cls._get_api_url(model)
                response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)

                if response.status_code == 200:
                    result = response.json()
                    try:
                        text = result['candidates'][0]['content']['parts'][0]['text']
                    except (KeyError, IndexError):
                        last_error = "AI generated empty response."
                        continue
                    return True, text, model

                if response.status_code == 429:
                    # Model quota exceeded - mark as exhausted and try next
                    cls._exhausted_models.add(model)
                    print(f"Model {model} quota exceeded, trying next model...")
                    last_error = cls._CAPACITY_MESSAGE
                    continue

                # Other error - try next model
                last_error = cls._parse_error_response(response.status_code, response.text)

            except requests.exceptions.Timeout:
                last_error = "AI service timed out."
            except requests.exceptions.ConnectionError:
                last_error = "Unable to connect to AI service."
            except Exception as exc:
                # Report only the exception type: the message text may contain
                # the request URL, which carries the API key.
                print(f"Model {model} request failed unexpectedly ({type(exc).__name__}), trying next model...")
                last_error = "An unexpected error occurred."

        # All models exhausted. When every model was skipped up front no
        # request was made and ``last_error`` is still empty; report capacity
        # instead of handing callers an empty message.
        return False, last_error or cls._CAPACITY_MESSAGE, ""

    @classmethod
    def query(cls, request: ChatRequest, context_data: StandardizedDataPackage) -> ChatResponse:
        """Answer a user chat question using the company's financial context.

        Builds a single prompt from the figures in ``context_data`` plus the
        user's question and sends it through the model fallback chain. The
        returned ``ChatResponse`` carries either the model's answer or the
        user-facing error message.
        """
        if not cls.API_KEY:
            return ChatResponse(response="Gemini API Key is missing. Please configure GEMINI_API_KEY in the backend.")

        dead_zones = list(context_data.optimization_insights.dead_zones) if context_data.optimization_insights else []

        # NOTE(review): this reads ``request.message`` while RAGService.query
        # reads ``request.messages[-1].content`` - confirm the ChatRequest schema.
        # Construct Prompt with Financial Context
        system_prompt = f"""
You are Visique, an expert AI CFO. You are analyzing the financial data for {context_data.raw_data.company_name}.

Financial Context:
- Revenue: {context_data.raw_data.income_statement.revenue} {context_data.raw_data.currency}
- Net Income: {context_data.raw_data.income_statement.net_income}
- Cash Balance: {context_data.raw_data.balance_sheet.cash}
- Health Score: {context_data.health_score.total_score}/100

Key Insights:
{json.dumps(context_data.insights, indent=2)}

Optimization Insights (Heatmap/Dead Zones):
{json.dumps(dead_zones, indent=2)}

User Question: {request.message}

Answer concisely as a CFO. If the user asks about "Dynamic Promos" or "Optimization", refer to the Dead Zones data.
"""

        payload = {
            "contents": [{
                "parts": [{"text": system_prompt}]
            }]
        }

        # Success and failure are both surfaced as the response text.
        _success, response_text, _model_used = cls._try_request(payload)
        return ChatResponse(response=response_text)

    @classmethod
    def generate_content(cls, prompt: str) -> str:
        """
        Generic generator for internal services (like GeoService).
        Uses automatic model fallback. Returns clean, presentable text:
        the model output on success, canned fallback content otherwise.
        """
        if not cls.API_KEY:
            return "Strategic insights require AI configuration. Contact support for assistance."

        payload = {
            "contents": [{
                "parts": [{"text": prompt}]
            }]
        }

        success, response_text, _model_used = cls._try_request(payload)
        if success:
            return response_text

        # Return intelligent fallback content instead of error
        return cls._get_fallback_content(prompt)

    @staticmethod
    def _get_fallback_content(prompt: str) -> str:
        """
        Provide meaningful fallback content when ALL AI models are unavailable.
        This ensures reports and displays never show error messages.

        The canned text is chosen by keyword-matching the lower-cased prompt.
        """
        prompt_lower = prompt.lower()

        if "competitor" in prompt_lower or "landscape" in prompt_lower:
            return """**Market Analysis**

Based on industry standards for your sector:

• **Primary Competition**: Focus on businesses within a 5-mile radius offering similar services
• **Traffic Patterns**: Peak hours typically align with lunch (11am-2pm) and evening (5pm-8pm) periods
• **Differentiation**: Evaluate unique value propositions against local alternatives

*AI-powered real-time analysis available when capacity permits.*"""

        elif "strategic" in prompt_lower or "context" in prompt_lower:
            return """**Strategic Context Overview**

Key considerations for your market:

• **Regulatory Environment**: Stay current with local business regulations and licensing requirements
• **Economic Indicators**: Monitor regional employment and consumer spending trends
• **Industry Outlook**: Your sector shows stable fundamentals with growth potential

*Enhanced AI insights will be available shortly.*"""

        elif "marketing" in prompt_lower or "growth" in prompt_lower:
            return """**Growth Strategy Framework**

Recommended focus areas for sustainable growth:

• **Digital Presence**: Optimize Google Business Profile and local SEO
• **Customer Retention**: Implement loyalty programs to increase lifetime value
• **Community Engagement**: Partner with local organizations for visibility

*AI-powered personalized recommendations available when capacity permits.*"""

        else:
            return """**Analysis Summary**

Your financial data has been processed successfully. Key takeaways:

• Review the health score breakdown for areas of strength and improvement
• Monitor cash runway projections for operational planning
• Consider the recommendations provided for optimization opportunities

*For deeper AI-driven insights, please try again in a few minutes.*"""

    @classmethod
    def get_model_status(cls) -> dict:
        """
        Get current status of available models (for debugging/admin).
        """
        available_models = [m for m in cls.MODELS if m not in cls._exhausted_models]
        exhausted = list(cls._exhausted_models)

        return {
            "total_models": len(cls.MODELS),
            "available_models": available_models,
            "exhausted_models": exhausted,
            "all_exhausted": len(available_models) == 0
        }
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/intelligence/geo_service.py
DELETED
|
@@ -1,104 +0,0 @@
|
|
| 1 |
-
|
| 2 |
-
import random
|
| 3 |
-
|
| 4 |
-
class GeoService:
    """Location-based strategic analysis, AI-backed when a Gemini key is configured."""

    @staticmethod
    def analyze_location(address: str, industry: str = "General", is_own_company: bool = False, company_name: str = ""):
        """
        Produce competitor, strategic-context and marketing write-ups for a location.

        When ``GeminiService.API_KEY`` is set, the three sections come from
        ``GeminiService.generate_content``; if that path raises, or no key is
        set, templated text with randomised scores is returned instead.

        :param address: The address to analyze
        :param industry: The industry type
        :param is_own_company: Whether this is the user's own company (enables more personalized insights)
        :param company_name: Name of the company being analyzed
        :return: dict with keys ``competitor_analysis``, ``strategic_context``
                 and ``marketing_strategy``
        """
        from app.services.intelligence.gemini_service import GeminiService

        context_prefix = f"for {company_name}" if company_name else ""
        if is_own_company:
            personalization = "your business"
        else:
            personalization = f"this {industry} business"

        # Check for Real AI Capability
        if GeminiService.API_KEY:
            try:
                # Owner-specific vs. generic wording for each prompt
                if is_own_company:
                    competitor_hint = 'As the owner, provide actionable competitive intelligence.'
                    context_hint = 'Include specific recommendations for the owner.'
                    marketing_hint = 'Be specific with actionable next steps for the owner to implement.'
                else:
                    competitor_hint = 'Provide general market context.'
                    context_hint = ''
                    marketing_hint = 'Provide general market positioning advice.'

                # 1. Competitor Landscape
                competitor_prompt = f"Analyze the competitor landscape {context_prefix} for a {industry} business located at {address}. {competitor_hint} Identify 3 competitors and describe the traffic patterns in the area. Limit to 150 words. Format with **Bold** headers."
                ai_competitors = GeminiService.generate_content(competitor_prompt)

                # 2. Strategic Context
                context_prompt = f"Provide a brief strategic context analysis for {address} regarding local regulations, news events, and economic sentiment for the {industry} sector {context_prefix}. {context_hint} Limit to 150 words."
                ai_context = GeminiService.generate_content(context_prompt)

                # 3. Marketing Strategy
                marketing_prompt = f"Suggest a growth and marketing strategy for {personalization} at {address}. {marketing_hint} Include digital positioning advice and 2 actionable recommendations. Limit to 150 words."
                ai_marketing = GeminiService.generate_content(marketing_prompt)

                return {
                    "competitor_analysis": ai_competitors,
                    "strategic_context": ai_context,
                    "marketing_strategy": ai_marketing
                }
            except Exception as e:
                # Best effort: fall through to the templated output below
                print(f"Gemini Generation Failed: {e}. Falling back to simulation.")

        # ... FALLBACK MOCK DATA ...
        # Mocking external data capabilities
        if industry != "Restaurant":
            competitors = ["Alpha Competitor Inc.", "Beta Rivals LLC", "Local Market Leader"]
        else:
            competitors = ["The Hungry Chef", "Burger King", "Downtown Bistro"]

        ownership_note = "As the owner of this business," if is_own_company else "For this business,"
        company_ref = company_name if company_name else "the business"

        # Random draws are taken in the same order the sections consume them
        walk_score = random.randint(40, 95)
        transit_score = random.randint(30, 80)
        spending_index = random.randint(90, 110)

        competitor_names = ', '.join(competitors)
        owner_action_item = "**Owner Action Item:** Engage with local business association for networking opportunities." if is_own_company else ""
        seo_verb = "Optimize your" if is_own_company else "Optimize the"
        owner_priority = "3. **Owner Priority:** Focus on building customer reviews - aim for 50+ 5-star reviews." if is_own_company else ""

        # 1. Competitor & Location Analysis (Page 1 content)
        comp_summary = f"""
**Location Analysis for:** {address}
**Company:** {company_ref}
**Industry Focus:** {industry}

**Competitor Landscape:**
{ownership_note} we have identified {len(competitors)} primary competitors within a 5-mile radius:
{competitor_names}.

**Traffic Patterns:**
Based on historical data, the highest foot traffic in your area occurs between 11:00 AM and 2:00 PM on weekdays.

**Site Accessibility:**
Your location has a Walk Score of {walk_score}/100 and Transit Score of {transit_score}/100.
"""

        # 2. Political & Local News Context (Page 2 content)
        context_summary = f"""
**Strategic Context: Local & Political Landscape**

**Regulatory Updates:**
Recent city council proceedings indicate a favorable shift for {industry} businesses.

**Economic Sentiment:**
Local consumer sentiment is currently 'Optimistic' with a spending index of {spending_index}.

{owner_action_item}
"""

        # 3. Marketing & Growth Opportunities (Page 3 content)
        marketing_summary = f"""
**Growth & Marketing Strategy for {company_ref}**

**Key Marketing Events:**
Leverage upcoming local opportunities like the Annual City Festival.

**Actionable Recommendations:**
1. **Hyper-Local SEO:** {seo_verb} Google Business Profile for '{company_ref}'.
2. **Community Partnerships:** Engage with local news events and neighborhood associations.
{owner_priority}
"""

        return {
            "competitor_analysis": comp_summary,
            "strategic_context": context_summary,
            "marketing_strategy": marketing_summary
        }
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/intelligence/rag.py
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
from app.schemas.chat import ChatRequest, ChatResponse, Message
|
| 2 |
-
from app.schemas.financial import StandardizedDataPackage
|
| 3 |
-
|
| 4 |
-
class RAGService:
    """Keyword-matching stand-in for a retrieval-augmented chat service."""

    @staticmethod
    def query(request: ChatRequest, data_context: StandardizedDataPackage) -> ChatResponse:
        """
        Scaffolds the RAG logic.

        In a real implementation, this would:
        1. Chunk the 'data_context' into vectors (Income, Balance, Risk).
        2. Embed the 'request.messages[-1].content'.
        3. Retrieve relevant chunks.
        4. Synthesize an answer via LLM.

        Currently the last message is scanned for the keywords "revenue",
        "net income"/"profit" and "margin", and the matching figures from
        ``data_context`` are listed. A request with no messages (or an empty
        last message) gets the generic help text instead of raising.
        """
        # Guard against an empty conversation and a missing message body.
        messages = request.messages
        last_message = (messages[-1].content or "").lower() if messages else ""

        # Simple Keyword Matching (Mock RAG)
        extracted_info = []
        if "revenue" in last_message:
            extracted_info.append(f"Revenue: ${data_context.raw_data.income_statement.revenue:,.2f}")
        if "net income" in last_message or "profit" in last_message:
            extracted_info.append(f"Net Income: ${data_context.raw_data.income_statement.net_income:,.2f}")
        if "margin" in last_message:
            extracted_info.append(f"Net Margin: {data_context.kpis.net_margin}%")

        if extracted_info:
            response_text = "Based on the latest financial data:\n- " + "\n- ".join(extracted_info)
        else:
            response_text = "I am a financial AI. Ask me about Revenue, Margins, or Risk."

        return ChatResponse(
            response=response_text,
            sources=["Financial Report Q4", "KPI Analysis Module"]
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/reporting/pdf_report.py
DELETED
|
@@ -1,565 +0,0 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
from reportlab.lib.pagesizes import letter
|
| 4 |
-
from reportlab.lib import colors
|
| 5 |
-
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
| 6 |
-
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, Frame, PageTemplate, Image
|
| 7 |
-
from reportlab.lib.units import inch
|
| 8 |
-
from reportlab.pdfgen import canvas
|
| 9 |
-
from reportlab.graphics.shapes import Drawing
|
| 10 |
-
from reportlab.graphics.charts.barcharts import VerticalBarChart
|
| 11 |
-
from reportlab.graphics.charts.linecharts import HorizontalLineChart
|
| 12 |
-
from reportlab.graphics.charts.piecharts import Pie
|
| 13 |
-
from reportlab.lib.colors import HexColor
|
| 14 |
-
from app.schemas.financial import StandardizedDataPackage
|
| 15 |
-
import os
|
| 16 |
-
import re
|
| 17 |
-
from datetime import datetime
|
| 18 |
-
from pypdf import PdfReader, PdfWriter, PageObject
|
| 19 |
-
import io
|
| 20 |
-
|
| 21 |
-
class PDFReporter:
|
| 22 |
-
|
| 23 |
-
TEMPLATE_PATH = "app/assets/report_template.pdf"
|
| 24 |
-
|
| 25 |
-
@staticmethod
|
| 26 |
-
def _sanitize_content(text: str) -> str:
|
| 27 |
-
"""Clean AI-generated content."""
|
| 28 |
-
if not text:
|
| 29 |
-
return ""
|
| 30 |
-
|
| 31 |
-
# Remove JSON blocks and API error responses
|
| 32 |
-
text = re.sub(r'\{[^}]*"@type"[^}]*\}', '', text)
|
| 33 |
-
text = re.sub(r'\{[^}]*"quotaMetric"[^}]*\}', '', text)
|
| 34 |
-
text = re.sub(r'\[\s*\{.*?\}\s*\]', '', text, flags=re.DOTALL)
|
| 35 |
-
text = re.sub(r'"[a-zA-Z_]+"\s*:\s*"[^"]*"', '', text)
|
| 36 |
-
text = re.sub(r'AI Error:\s*\d+.*', '', text)
|
| 37 |
-
text = re.sub(r'System Error:.*', '', text)
|
| 38 |
-
text = re.sub(r'https?://[^\s]+', '', text)
|
| 39 |
-
|
| 40 |
-
# Clean up markdown formatting
|
| 41 |
-
text = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text) # Bold
|
| 42 |
-
text = text.replace("##", "").replace("###", "").replace("#", "")
|
| 43 |
-
text = re.sub(r'(?<!\S)\*\s+', '• ', text) # Bullet points
|
| 44 |
-
|
| 45 |
-
text = re.sub(r'\n{3,}', '\n\n', text)
|
| 46 |
-
text = re.sub(r'[ \t]+', ' ', text)
|
| 47 |
-
text = text.strip()
|
| 48 |
-
text = re.sub(r'[\[\]{}]', '', text)
|
| 49 |
-
|
| 50 |
-
return text if text else "Analysis data will be available upon API configuration."
|
| 51 |
-
|
| 52 |
-
@staticmethod
def _create_pie_chart(data_dict, title="Breakdown", width=400, height=200):
    """Build a Drawing containing a pie chart of the positive entries of ``data_dict``.

    Entries whose value is falsy or not greater than zero are left out. When
    nothing remains, the empty Drawing is returned without a pie. ``title``
    is accepted but not rendered.
    """
    canvas_area = Drawing(width, height)
    pie = Pie()

    # Scaling - constrain by the smaller dimension with margin, min 50
    diameter = max(min(width, height) - 20, 50)
    pie.width = diameter
    pie.height = diameter
    pie.x = (width - diameter) / 2
    pie.y = (height - diameter) / 2

    # Keep only strictly positive values
    kept = [(name, amount) for name, amount in data_dict.items() if amount and amount > 0]
    if not kept:
        return canvas_area

    pie.data = [amount for _, amount in kept]
    pie.labels = [name for name, _ in kept]
    pie.slices.strokeWidth = 0.5

    # Visique Colors, cycled when there are more slices than colors
    palette = [
        HexColor("#0891b2"), HexColor("#0f172a"), HexColor("#38bdf8"),
        HexColor("#94a3b8"), HexColor("#cffafe")
    ]
    for slot, _ in enumerate(kept):
        pie.slices[slot].fillColor = palette[slot % len(palette)]

    canvas_area.add(pie)
    return canvas_area
|
| 91 |
-
|
| 92 |
-
@staticmethod
def _create_chart_with_description(data: StandardizedDataPackage, type='revenue', width=400, height=200):
    """Build a chart of the requested kind with a centered caption underneath.

    ``type`` selects the chart: 'revenue' (bar chart of revenue, operating
    expenses and net income), 'runway' (projected cash line), 'expenses_pie'
    or 'assets_pie'. ``width``/``height`` control the drawing size. Returns a
    two-row Table holding the drawing and its caption paragraph.
    """
    chart = Drawing(width, height)
    caption = ""

    if type == 'revenue':
        rev = data.raw_data.income_statement.revenue or 0
        exp = data.raw_data.income_statement.operating_expenses or 0
        net = data.raw_data.income_statement.net_income or 0

        # Caption figures: margin in percent, revenue in billions or millions
        margin = (net / rev * 100) if rev else 0
        rev_str = f"${rev/1e9:.1f}B" if rev >= 1e9 else f"${rev/1e6:.1f}M"
        caption = f"<b>Performance Overview:</b> The company generated <b>{rev_str}</b> in revenue. <br/>Net profit margin stands at <b>{margin:.1f}%</b> after expenses."

        bars = VerticalBarChart()
        bars.x = width * 0.15  # dynamic margin
        bars.y = 50
        bars.height = height * 0.6
        bars.width = width * 0.75
        bars.data = [(rev, exp, net)]
        bars.strokeColor = colors.white

        # Axis tops out 10% above the largest bar (floor of 100)
        peak = max(rev, exp, net, 100)
        bars.valueAxis.valueMin = 0
        bars.valueAxis.valueMax = peak * 1.1
        bars.valueAxis.valueStep = peak / 4
        bars.valueAxis.labelTextFormat = lambda x: f'{x/1e9:.1f}B' if x >= 1e9 else f'{x/1e6:.0f}M'

        bars.categoryAxis.categoryNames = ['Revenue', 'Op. Expenses', 'Net Income']
        bars.bars[0].fillColor = colors.HexColor("#0891b2")
        chart.add(bars)

    elif type == 'runway':
        if not data.runway_forecast:
            caption = "Runway data unavailable."
        else:
            burn = abs(data.runway_forecast.burn_rate_monthly or 0)
            months = min(data.runway_forecast.months_left or 0, 24)  # capped at 24
            cash = data.raw_data.balance_sheet.cash or 0

            caption = f"<b>Cash Runway:</b> Based on a monthly burn of <b>${burn:,.0f}</b>,<br/>cash reserves will support operations for <b>{months:.1f} months</b>."

            if burn > 0:
                # Month-by-month cash balance, floored at zero
                projection = [max(0, cash - (burn * step)) for step in range(int(months) + 2)]
                line = HorizontalLineChart()
                line.x = width * 0.15
                line.y = 50
                line.height = height * 0.6
                line.width = width * 0.75
                line.data = [projection]
                line.joinedLines = 1
                line.categoryAxis.categoryNames = [f"M{step}" for step in range(len(projection))]
                line.valueAxis.valueMin = 0
                line.lines[0].strokeColor = colors.HexColor("#06b6d4")
                line.lines[0].strokeWidth = 2
                chart.add(line)

    elif type == 'expenses_pie':
        income = data.raw_data.income_statement
        expense_breakdown = {
            "COGS": income.cogs,
            "Payroll": income.payroll_expenses,
            "Marketing": income.marketing_expenses,
            "Rent": income.rent_expense,
            "Other": income.other_operating_expenses
        }
        chart = PDFReporter._create_pie_chart(expense_breakdown, width=width, height=height)
        caption = "<b>Expense Profile:</b> Breakdown of major cost centers.<br/>Monitor COGS and Payroll trends."

    elif type == 'assets_pie':
        balance = data.raw_data.balance_sheet
        asset_breakdown = {
            "Cash": balance.cash,
            "Receivables": balance.accounts_receivable,
            "Inventory": balance.inventory,
            "Property/Eq": balance.property_plant_equipment,
            "Intangibles": balance.intangible_assets,
        }
        chart = PDFReporter._create_pie_chart(asset_breakdown, width=width, height=height)
        caption = "<b>Asset Mix:</b> Composition of short vs long term assets.<br/>Liquidity is key for stability."

    # Wrapper Table for Centering & Description
    base_styles = getSampleStyleSheet()
    caption_style = ParagraphStyle('ChartDesc', parent=base_styles['Normal'], fontSize=9, leading=11, alignment=1, textColor=colors.HexColor("#64748b"))

    wrapper = Table([[chart], [Paragraph(caption, caption_style)]], colWidths=[width], rowHeights=[height+10, 40])
    wrapper.setStyle(TableStyle([
        ('ALIGN', (0,0), (-1,-1), 'CENTER'),
        ('VALIGN', (0,0), (-1,-1), 'TOP'),
    ]))
    return wrapper
|
| 187 |
-
|
| 188 |
-
@staticmethod
def cover_template_header(canvas, doc):
    """Paint a borderless white rectangle over the template's header placeholder area.

    ``doc`` is accepted but not used.
    """
    # Mask geometry in points. The original author's notes place the page
    # height at 11 inch = 792 pt, the logo in the top inch (y=720+) and the
    # "Company Name" style placeholders around y=700.
    mask_x, mask_y = 0, 680
    mask_width, mask_height = 612, 60

    canvas.saveState()
    canvas.setFillColor(colors.white)
    canvas.rect(mask_x, mask_y, mask_width, mask_height, fill=1, stroke=0)
    canvas.restoreState()
|
| 199 |
-
|
| 200 |
-
@staticmethod
def _create_chart(data: StandardizedDataPackage, type='revenue'):
    """Build a fixed-size (400x200) Drawing for the report.

    'revenue' draws a bar chart of revenue, operating expenses and net
    income; 'runway' draws the projected cash balance when the monthly burn
    is positive. Any other ``type`` yields an empty Drawing.
    """
    canvas_area = Drawing(400, 200)

    if type == 'revenue':
        # Revenue vs Expenses vs Net Income
        income = data.raw_data.income_statement
        rev = income.revenue or 0
        exp = income.operating_expenses or 0
        net = income.net_income or 0

        bars = VerticalBarChart()
        bars.x = 50
        bars.y = 50
        bars.height = 125
        bars.width = 300
        bars.data = [(rev, exp, net)]
        bars.strokeColor = colors.white

        # Dynamic axis scaling: 10% headroom over the largest value (floor 100)
        peak = max(rev, exp, net, 100)
        bars.valueAxis.valueMin = 0
        bars.valueAxis.valueMax = peak * 1.1
        bars.valueAxis.valueStep = peak / 5

        bars.categoryAxis.labels.boxAnchor = 'ne'
        bars.categoryAxis.labels.dx = 8
        bars.categoryAxis.labels.dy = -2
        bars.categoryAxis.categoryNames = ['Revenue', 'Op. Expenses', 'Net Income']
        bars.bars[0].fillColor = colors.HexColor("#0891b2")
        canvas_area.add(bars)

    elif type == 'runway':
        # Simple burn rate projection; zeros when no forecast is attached
        burn, months = 0, 0
        if data.runway_forecast:
            burn = abs(data.runway_forecast.burn_rate_monthly or 0)
            months = min(data.runway_forecast.months_left or 0, 24)

        cash = data.raw_data.balance_sheet.cash or 0

        # Projected cash balance line
        if burn > 0:
            projection = [max(0, cash - (burn * step)) for step in range(int(months) + 2)]

            line = HorizontalLineChart()
            line.x = 50
            line.y = 50
            line.height = 125
            line.width = 300
            line.data = [projection]
            line.joinedLines = 1
            line.categoryAxis.categoryNames = [f"M{step}" for step in range(len(projection))]
            line.valueAxis.valueMin = 0
            line.lines[0].strokeColor = colors.HexColor("#06b6d4")
            line.lines[0].strokeWidth = 2
            canvas_area.add(line)

    return canvas_area
|
| 263 |
-
|
| 264 |
-
@staticmethod
def generate(data: StandardizedDataPackage, filename: str):
    """Render the multi-page financial PDF report and write it to ``filename``.

    The report body is laid out in memory with ReportLab, then every page is
    stamped over the first page of the branded template found at
    ``PDFReporter.TEMPLATE_PATH`` (resolved against the current working
    directory). When the template is missing, or merging fails for any
    reason, the un-branded ReportLab output is written instead so a report
    is still produced.

    Args:
        data: Analysis package providing raw statements, health score,
            KPIs, runway forecast, insights and geo analysis.
        filename: Destination path of the generated PDF.

    Returns:
        ``filename``, unchanged.
    """

    def fmt(value, spec, prefix="", suffix=""):
        # Statement/KPI fields may be None (the chart helpers already guard
        # them with ``or 0``); show a placeholder instead of crashing.
        if value is None:
            return "N/A"
        return f"{prefix}{format(value, spec)}{suffix}"

    def money(value):
        return fmt(value, ",.2f", prefix="$")

    def styled_table(rows, col_widths, commands):
        table = Table(rows, colWidths=col_widths)
        table.setStyle(TableStyle(commands))
        return table

    def write_unbranded():
        # Fallback output: the raw ReportLab document without the template.
        with open(filename, "wb") as f:
            f.write(packet.getbuffer())

    # 1. Generate content PDF using ReportLab
    packet = io.BytesIO()
    doc = SimpleDocTemplate(
        packet,
        pagesize=letter,
        rightMargin=inch,
        leftMargin=inch,
        topMargin=1.5*inch,  # More space for header
        bottomMargin=1*inch,
    )

    # Styles
    styles = getSampleStyleSheet()

    title_style = ParagraphStyle('VisiqueTitle', parent=styles['Heading1'], fontSize=26, textColor=colors.HexColor("#0f172a"), spaceAfter=25, fontName='Helvetica-Bold')
    section_style = ParagraphStyle('VisiqueSection', parent=styles['Heading1'], fontSize=30, textColor=colors.HexColor("#0f172a"), spaceBefore=100, spaceAfter=20, alignment=1, fontName='Helvetica-Bold')
    header_style = ParagraphStyle('VisiqueHeader', parent=styles['Heading2'], fontSize=16, textColor=colors.HexColor("#334155"), spaceBefore=20, spaceAfter=10, keepWithNext=True, fontName='Helvetica-Bold')
    body_style = ParagraphStyle('VisiqueBody', parent=styles['Normal'], fontSize=11, leading=15, spaceAfter=10, textColor=colors.HexColor("#334155"), fontName='Helvetica')
    score_style = ParagraphStyle('ScoreStyle', parent=styles['Normal'], fontSize=32, leading=36, textColor=colors.HexColor("#0ea5e9"), alignment=1, fontName='Helvetica-Bold')
    tiny_meta = ParagraphStyle('TinyMeta', parent=styles['Normal'], fontSize=8, textColor=colors.gray)

    income = data.raw_data.income_statement
    balance = data.raw_data.balance_sheet
    cash_flow = data.raw_data.cash_flow

    elements = []

    # === PAGE 1: EXECUTIVE SUMMARY ===
    elements.append(Paragraph("Financial Intelligence Report", title_style))
    elements.append(Paragraph(f"<b>Target Entity:</b> {data.raw_data.company_name}", body_style))
    elements.append(Paragraph(f"<b>Reporting Period:</b> {data.raw_data.period_end}", body_style))
    elements.append(Spacer(1, 20))

    # Health Score Box
    elements.append(Paragraph("Strategic Health Score", header_style))
    score_data = [[
        Paragraph(f"<b>{data.health_score.total_score:.0f}</b> / 100", score_style),
        [
            Paragraph(f"• Stability: {data.health_score.stability:.0f}/25", body_style),
            Paragraph(f"• Profitability: {data.health_score.profitability:.0f}/35", body_style),
            Paragraph(f"• Growth: {data.health_score.growth:.0f}/10", body_style),
            Paragraph(f"• Efficiency: {data.health_score.efficiency:.0f}/20", body_style),
        ],
    ]]
    elements.append(styled_table(score_data, [2*inch, 4*inch], [
        ('ALIGN', (0,0), (0,0), 'CENTER'),
        ('VALIGN', (0,0), (-1,-1), 'MIDDLE'),
        ('BOX', (0,0), (0,0), 1, colors.HexColor("#0891b2")),
        ('ROUNDEDCORNERS', [10, 10, 10, 10]),
        ('TOPPADDING', (0,0), (-1,-1), 15),
        ('BOTTOMPADDING', (0,0), (-1,-1), 15),
    ]))
    elements.append(Spacer(1, 15))

    # Executive insights: split into wins and risks by keyword, at most
    # three of each.
    wins = []
    risks = []
    risk_markers = ["Risk", "High", "Decrease", "Burn", "Negative"]
    for insight in data.insights or []:
        bucket = risks if any(marker in insight for marker in risk_markers) else wins
        if len(bucket) < 3:
            bucket.append(insight)

    # Ensure at least some data
    if not wins:
        wins = ["Stable operations detected."]
    if not risks:
        risks = ["No critical risks detected."]

    rw_data = [
        [Paragraph("<b>Key Wins</b>", body_style), Paragraph("<b>Risk Factors</b>", body_style)],
        [[Paragraph(f"• {PDFReporter._sanitize_content(w)}", body_style) for w in wins],
         [Paragraph(f"• {PDFReporter._sanitize_content(r)}", body_style) for r in risks]],
    ]
    elements.append(styled_table(rw_data, [3*inch, 3*inch], [
        ('BACKGROUND', (0,0), (0,0), colors.HexColor("#dcfce7")),  # Light Green
        ('BACKGROUND', (1,0), (1,0), colors.HexColor("#fee2e2")),  # Light Red
        ('VALIGN', (0,0), (-1,-1), 'TOP'),
        ('GRID', (0,0), (-1,-1), 0.5, colors.grey),
        ('TOPPADDING', (0,0), (-1,-1), 6),
        ('BOTTOMPADDING', (0,0), (-1,-1), 6),
    ]))

    elements.append(PageBreak())

    # === PAGE 2: INCOME STATEMENT ===
    elements.append(Paragraph("Income Statement Analysis", title_style))

    # Two charts side by side need a wrapper table.
    # Page width 8.5in, margins 1in each, so content is 6.5in wide;
    # two columns of 3.2in fit within that.
    col_w = 3.2 * inch

    c1 = PDFReporter._create_chart_with_description(data, 'revenue', width=220, height=180)
    c2 = PDFReporter._create_chart_with_description(data, 'expenses_pie', width=220, height=180)

    elements.append(styled_table([[c1, c2]], [col_w, col_w], [
        ('ALIGN', (0,0), (-1,-1), 'CENTER'),
        ('VALIGN', (0,0), (-1,-1), 'TOP'),
    ]))

    elements.append(Spacer(1, 15))

    income_data = [
        ["Metric", "Value"],
        ["Revenue", money(income.revenue)],
        ["COGS", money(income.cogs)],
        ["Gross Profit", money(income.gross_profit)],
        ["Op. Expenses", money(income.operating_expenses)],
        ["Net Income", money(income.net_income)],
    ]
    elements.append(styled_table(income_data, [3.5*inch, 2.5*inch], [
        ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor("#0891b2")),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
        ('ALIGN', (1, 0), (-1, -1), 'RIGHT'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('GRID', (0, 0), (-1, -1), 1, colors.HexColor("#e2e8f0")),
        ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.HexColor("#f8fafc")]),
        ('TOPPADDING', (0,0), (-1,-1), 8),
        ('BOTTOMPADDING', (0,0), (-1,-1), 8),
    ]))
    elements.append(PageBreak())

    # === PAGE 3: BALANCE SHEET ===
    elements.append(Paragraph("Balance Sheet & Ratios", title_style))

    # Asset chart, centered horizontally via a one-cell wrapper table
    c_assets = PDFReporter._create_chart_with_description(data, 'assets_pie')
    elements.append(styled_table([[c_assets]], [7*inch], [('ALIGN', (0,0), (-1,-1), 'CENTER')]))

    elements.append(Spacer(1, 10))

    kpi_data = [["Key Ratio", "Value", "Benchmark"]]
    if data.kpis:
        kpi_data.extend([
            ["Current Ratio", fmt(data.kpis.current_ratio, ".2f", suffix="x"), "> 1.5x"],
            ["Debt-to-Equity", fmt(data.kpis.debt_to_equity, ".2f", suffix="x"), "< 2.0x"],
            ["Return on Equity", fmt(data.kpis.roe, ".1%"), "15-20%"],
            ["DSO", fmt(data.kpis.dso, ".0f", suffix=" days"), "< 45 days"],
        ])

    elements.append(styled_table(kpi_data, [2.5*inch, 1.5*inch, 1.5*inch], [
        ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor("#0f172a")),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('GRID', (0, 0), (-1, -1), 1, colors.HexColor("#e2e8f0")),
        ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.HexColor("#f1f5f9")]),
        ('TOPPADDING', (0,0), (-1,-1), 8),
        ('BOTTOMPADDING', (0,0), (-1,-1), 8),
    ]))
    elements.append(Spacer(1, 20))

    bs_data = [
        ["Balance Sheet Item", "Value"],
        ["Total Assets", money(balance.total_assets)],
        [" Cash & Equiv.", money(balance.cash)],
        ["Total Liabilities", money(balance.total_liabilities)],
        [" Short Term Debt", money(balance.short_term_debt)],
        [" Long Term Debt", money(balance.long_term_debt)],
        ["Total Equity", money(balance.total_equity)],
    ]
    elements.append(styled_table(bs_data, [3.5*inch, 2.5*inch], [
        ('LINEBELOW', (0,0), (-1,0), 1, colors.black),
        ('ALIGN', (1, 0), (-1, -1), 'RIGHT'),
        ('TOPPADDING', (0,0), (-1,-1), 6),
        ('BOTTOMPADDING', (0,0), (-1,-1), 6),
    ]))
    elements.append(PageBreak())

    # === PAGE 4: CASH FLOW & RUNWAY ===
    elements.append(Paragraph("Cash Flow & Runway", title_style))

    runway = data.runway_forecast
    # A missing burn rate counts as "no burn" rather than raising on the comparison.
    if runway and (runway.burn_rate_monthly or 0) > 0:
        c_runway = PDFReporter._create_chart_with_description(data, 'runway')
        elements.append(styled_table([[c_runway]], [7*inch], [('ALIGN', (0,0), (-1,-1), 'CENTER')]))
    else:
        elements.append(Paragraph("Positive Cash Flow Generation", header_style))
        elements.append(Paragraph("This entity is cash flow positive and does not have a finite runway.", body_style))

    elements.append(Spacer(1, 20))
    cf_data = [
        ["Cash Flow Metric", "Value"],
        ["Operating Cash Flow", money(cash_flow.operating_cash_flow)],
        ["Investing Cash Flow", money(cash_flow.investing_cash_flow)],
        ["Financing Cash Flow", money(cash_flow.financing_cash_flow)],
        ["Net Change in Cash", money(cash_flow.net_change_in_cash)],
    ]
    elements.append(styled_table(cf_data, [3.5*inch, 2.5*inch], [
        ('ALIGN', (1, 0), (-1, -1), 'RIGHT'),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
        ('TOPPADDING', (0,0), (-1,-1), 6),
        ('BOTTOMPADDING', (0,0), (-1,-1), 6),
    ]))
    elements.append(PageBreak())

    # === PAGE 5: STRATEGIC INTELLIGENCE ===
    elements.append(Paragraph("Strategic Intelligence", title_style))
    if data.geo_analysis:
        geo_sections = [
            ("Market Context", data.geo_analysis.strategic_context),
            ("Competitors", data.geo_analysis.competitor_analysis),
            ("Growth Strategy", data.geo_analysis.marketing_strategy),
        ]
        for heading, text in geo_sections:
            elements.append(Paragraph(heading, header_style))
            elements.append(Paragraph(PDFReporter._sanitize_content(text), body_style))
    else:
        elements.append(Paragraph("Strategic data unavailable.", body_style))
    elements.append(PageBreak())

    # === PAGE 6: AI CFO RECOMMENDATIONS ===
    elements.append(Paragraph("Predictive Outlook & Recommendations", title_style))

    if data.insights:
        # Skip the first 3, which always appear in the executive summary
        for i, insight in enumerate(data.insights[3:], 1):
            elements.append(Paragraph(f"Recommendation #{i}", header_style))
            elements.append(Paragraph(PDFReporter._sanitize_content(insight), body_style))

    elements.append(PageBreak())

    # === PAGE 7: APPENDIX ===
    elements.append(Paragraph("Appendix: Full Data", title_style))
    elements.append(Paragraph("Raw data extraction log.", body_style))
    elements.append(Paragraph(f"Generated by Visique Engine v2.1 on {datetime.now()}", tiny_meta))

    # The same page callback draws the header box on the first and later pages
    doc.build(elements, onFirstPage=PDFReporter.cover_template_header, onLaterPages=PDFReporter.cover_template_header)
    packet.seek(0)

    # 2. Overlay onto Template
    try:
        template_path = os.path.join(os.getcwd(), PDFReporter.TEMPLATE_PATH)
        if not os.path.exists(template_path):
            # Template missing - just save the raw pdf
            write_unbranded()
            return filename

        template_pdf = PdfReader(template_path)
        content_pdf = PdfReader(packet)
        output_pdf = PdfWriter()

        # Every content page reuses template page 0 as its background.
        template_page = template_pdf.pages[0]
        width = template_page.mediabox.width
        height = template_page.mediabox.height

        for content_page in content_pdf.pages:
            # Blank page of template size, then background, then foreground.
            output_page = PageObject.create_blank_page(width=width, height=height)
            output_page.merge_page(template_page)
            output_page.merge_page(content_page)
            output_pdf.add_page(output_page)

        with open(filename, "wb") as f:
            output_pdf.write(f)

    except Exception as e:
        # Deliberate best-effort: a broken template must not prevent report delivery.
        print(f"Error merging template: {e}")
        write_unbranded()

    return filename
|
| 564 |
-
|
| 565 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/services/reporting/pptx_report.py
DELETED
|
@@ -1,57 +0,0 @@
|
|
| 1 |
-
from pptx import Presentation
|
| 2 |
-
from pptx.util import Inches, Pt
|
| 3 |
-
from pptx.dml.color import RGBColor
|
| 4 |
-
from app.schemas.financial import StandardizedDataPackage
|
| 5 |
-
|
| 6 |
-
class PPTXReporter:
    """Build a three-slide PowerPoint summary of a financial analysis package."""

    @staticmethod
    def _add_bullets(slide, lines):
        """Write one top-level bullet per entry of ``lines`` into the slide's body placeholder.

        Returns the created paragraphs in the same order as ``lines``.
        """
        text_frame = slide.placeholders[1].text_frame
        paragraphs = []
        for index, line in enumerate(lines):
            # A text frame always starts with one empty paragraph; reuse it
            # for the first bullet so the slide does not open with a blank line.
            paragraph = text_frame.paragraphs[0] if index == 0 else text_frame.add_paragraph()
            paragraph.text = line
            paragraph.level = 0
            paragraphs.append(paragraph)
        return paragraphs

    @staticmethod
    def generate(data: StandardizedDataPackage, filename: str):
        """Create the presentation for ``data`` and save it to ``filename``.

        Slides: a title slide (company name and risk score), a key-metrics
        slide (revenue, net margin, solvency risk) and an insights slide in
        which entries containing "Pain Point" are coloured red.

        Args:
            data: Analysis package providing raw statements, KPIs, risk
                analysis and insights.
            filename: Destination path of the .pptx file.

        Returns:
            ``filename``, unchanged.
        """
        prs = Presentation()

        # Slide 1: Title Slide
        title_slide = prs.slides.add_slide(prs.slide_layouts[0])
        title_slide.shapes.title.text = f"Financial Analysis: {data.raw_data.company_name}"
        title_slide.placeholders[1].text = f"Risk Score: {data.risk_analysis.risk_score} | Visique AI"

        # Slide 2: Key Metrics
        content_layout = prs.slide_layouts[1]  # Title and Content
        metrics_slide = prs.slides.add_slide(content_layout)
        metrics_slide.shapes.title.text = "Key Financial Metrics"

        revenue = data.raw_data.income_statement.revenue
        net_margin = data.kpis.net_margin if data.kpis is not None else None
        PPTXReporter._add_bullets(metrics_slide, [
            f"Revenue: ${revenue:,}" if revenue is not None else "Revenue: N/A",
            # Compare against None so a genuine 0% margin is still reported.
            f"Net Margin: {net_margin}%" if net_margin is not None else "Net Margin: N/A",
            f"Solvency Risk: {data.risk_analysis.solvency_risk}",
        ])

        # Slide 3: Insights & Pain Points
        insights_slide = prs.slides.add_slide(content_layout)
        insights_slide.shapes.title.text = "AI Insights & Pain Points"

        insights = list(data.insights or [])
        for paragraph, insight in zip(PPTXReporter._add_bullets(insights_slide, insights), insights):
            if "Pain Point" in insight:
                paragraph.font.color.rgb = RGBColor(255, 0, 0)

        prs.save(filename)
        return filename
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/requirements.txt
DELETED
|
@@ -1,29 +0,0 @@
|
|
| 1 |
-
fastapi
|
| 2 |
-
uvicorn[standard]
|
| 3 |
-
python-multipart
|
| 4 |
-
pandas
|
| 5 |
-
numpy
|
| 6 |
-
pydantic[email]
|
| 7 |
-
pydantic-settings
|
| 8 |
-
sqlalchemy
|
| 9 |
-
alembic
|
| 10 |
-
psycopg2-binary
|
| 11 |
-
cryptography
|
| 12 |
-
python-jose[cryptography]
|
| 13 |
-
passlib[bcrypt]
|
| 14 |
-
openpyxl
|
| 15 |
-
pdfminer.six==20231228
|
| 16 |
-
pdfplumber==0.10.3
|
| 17 |
-
reportlab
|
| 18 |
-
python-pptx
|
| 19 |
-
pypdf
|
| 20 |
-
stripe
|
| 21 |
-
email-validator
|
| 22 |
-
argon2-cffi
|
| 23 |
-
httpx
|
| 24 |
-
# Dolphin PDF Extraction (hybrid parser)
|
| 25 |
-
torch>=2.0.0
|
| 26 |
-
transformers>=4.40.0
|
| 27 |
-
huggingface-hub
|
| 28 |
-
Pillow
|
| 29 |
-
pdf2image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
dolphin/__init__.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Standalone Dolphin module for the AI Worker.
|
| 3 |
+
No backend dependencies — everything needed is self-contained here.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import logging
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
DEFAULT_MODEL_ID = "ByteDance/Dolphin"
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _detect_device() -> str:
|
| 15 |
+
"""Auto-detect best available compute device."""
|
| 16 |
+
try:
|
| 17 |
+
import torch
|
| 18 |
+
if torch.cuda.is_available():
|
| 19 |
+
return "cuda"
|
| 20 |
+
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
|
| 21 |
+
return "mps"
|
| 22 |
+
except ImportError:
|
| 23 |
+
pass
|
| 24 |
+
return "cpu"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def get_device() -> str:
    """Return the compute device to use.

    Reads the ``DOLPHIN_DEVICE`` environment variable. Surrounding whitespace
    and letter case are ignored; an unset, empty or "auto" value falls back
    to auto-detection via ``_detect_device()``.
    """
    requested = os.getenv("DOLPHIN_DEVICE", "auto").strip().lower()
    # An empty value (e.g. ``DOLPHIN_DEVICE=`` in a Dockerfile) is not a
    # usable device string, so treat it like "auto".
    if requested and requested != "auto":
        return requested
    return _detect_device()
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def get_model_path() -> str:
    """Return the model path or hub id to load.

    Reads the ``DOLPHIN_MODEL_PATH`` environment variable; an unset or blank
    value falls back to ``DEFAULT_MODEL_ID``.
    """
    configured = os.getenv("DOLPHIN_MODEL_PATH", "").strip()
    # A blank value would be passed on as an empty model id, so use the default.
    return configured or DEFAULT_MODEL_ID
|
{backend/app/services/ingestion/dolphin → dolphin}/client.py
RENAMED
|
@@ -1,8 +1,6 @@
|
|
| 1 |
"""
|
| 2 |
-
Dolphin Client —
|
| 3 |
-
|
| 4 |
-
Provides page-level, element-level, and layout parsing capabilities
|
| 5 |
-
with automatic device selection (CUDA > MPS > CPU).
|
| 6 |
"""
|
| 7 |
|
| 8 |
import os
|
|
@@ -20,9 +18,9 @@ logger = logging.getLogger(__name__)
|
|
| 20 |
@dataclass
|
| 21 |
class DolphinElement:
|
| 22 |
"""A single parsed element from a document page."""
|
| 23 |
-
element_type: str
|
| 24 |
-
content: str
|
| 25 |
-
bbox: Optional[List[float]] = None
|
| 26 |
confidence: float = 1.0
|
| 27 |
page_number: int = 0
|
| 28 |
metadata: Dict[str, Any] = field(default_factory=dict)
|
|
@@ -32,7 +30,7 @@ class DolphinElement:
|
|
| 32 |
class DolphinPageResult:
|
| 33 |
"""Result from page-level parsing."""
|
| 34 |
page_number: int
|
| 35 |
-
markdown: str
|
| 36 |
structured_json: Dict[str, Any] = field(default_factory=dict)
|
| 37 |
elements: List[DolphinElement] = field(default_factory=list)
|
| 38 |
|
|
@@ -41,9 +39,9 @@ class DolphinPageResult:
|
|
| 41 |
class DolphinLayoutResult:
|
| 42 |
"""Result from layout analysis."""
|
| 43 |
page_number: int
|
| 44 |
-
sections: List[Dict[str, Any]] = field(default_factory=list)
|
| 45 |
-
reading_order: List[int] = field(default_factory=list)
|
| 46 |
-
doc_type_hint: str = "unknown"
|
| 47 |
|
| 48 |
|
| 49 |
@dataclass
|
|
@@ -58,49 +56,24 @@ class DolphinDocumentResult:
|
|
| 58 |
class DolphinClient:
|
| 59 |
"""
|
| 60 |
High-level client for Dolphin-v2 document parsing.
|
| 61 |
-
|
| 62 |
-
Acts as a factory: returns either a local model wrapper (if no API URL)
|
| 63 |
-
or a remote client (if API URL is configured).
|
| 64 |
"""
|
| 65 |
|
| 66 |
-
@staticmethod
|
| 67 |
-
def create():
|
| 68 |
-
"""
|
| 69 |
-
Factory method to create the appropriate Dolphin client.
|
| 70 |
-
|
| 71 |
-
Returns:
|
| 72 |
-
RemoteDolphinClient if DOLPHIN_API_URL is set
|
| 73 |
-
LocalDolphinClient (self) otherwise
|
| 74 |
-
"""
|
| 75 |
-
from app.core.config import settings
|
| 76 |
-
|
| 77 |
-
if settings.DOLPHIN_API_URL:
|
| 78 |
-
from app.services.ingestion.dolphin.remote_client import RemoteDolphinClient
|
| 79 |
-
return RemoteDolphinClient()
|
| 80 |
-
|
| 81 |
-
return DolphinClient()
|
| 82 |
-
|
| 83 |
def __init__(
|
| 84 |
self,
|
| 85 |
model_path: Optional[str] = None,
|
| 86 |
device: Optional[str] = None,
|
| 87 |
max_batch_size: int = 4,
|
| 88 |
):
|
| 89 |
-
from
|
| 90 |
|
| 91 |
-
self.model_path = model_path or
|
| 92 |
self.device = device or get_device()
|
| 93 |
self.max_batch_size = max_batch_size
|
| 94 |
self._model = None
|
| 95 |
self._processor = None
|
| 96 |
|
| 97 |
-
logger.info(
|
| 98 |
-
f"DolphinClient initialized: model={self.model_path}, device={self.device}"
|
| 99 |
-
)
|
| 100 |
-
|
| 101 |
-
# ------------------------------------------------------------------
|
| 102 |
-
# Lazy model loading
|
| 103 |
-
# ------------------------------------------------------------------
|
| 104 |
|
| 105 |
def _ensure_loaded(self):
|
| 106 |
"""Lazy-load model and processor on first use."""
|
|
@@ -111,7 +84,7 @@ class DolphinClient:
|
|
| 111 |
import torch
|
| 112 |
from transformers import AutoModelForVision2Seq, AutoProcessor
|
| 113 |
|
| 114 |
-
logger.info(f"Loading Dolphin
|
| 115 |
|
| 116 |
self._processor = AutoProcessor.from_pretrained(
|
| 117 |
self.model_path, trust_remote_code=True
|
|
@@ -124,31 +97,25 @@ class DolphinClient:
|
|
| 124 |
self._model.to(self.device)
|
| 125 |
self._model.eval()
|
| 126 |
|
| 127 |
-
logger.info("Dolphin
|
| 128 |
|
| 129 |
except Exception as e:
|
| 130 |
logger.error(f"Failed to load Dolphin model: {e}")
|
| 131 |
raise RuntimeError(f"Dolphin model loading failed: {e}") from e
|
| 132 |
|
| 133 |
-
# ------------------------------------------------------------------
|
| 134 |
-
# PDF → Images conversion
|
| 135 |
-
# ------------------------------------------------------------------
|
| 136 |
-
|
| 137 |
@staticmethod
|
| 138 |
def _pdf_to_images(pdf_path: str) -> list:
|
| 139 |
-
"""Convert PDF pages to PIL Images
|
| 140 |
try:
|
| 141 |
from pdf2image import convert_from_path
|
| 142 |
-
|
| 143 |
-
return images
|
| 144 |
except ImportError:
|
| 145 |
-
|
| 146 |
-
logger.warning("pdf2image not installed, using fallback renderer")
|
| 147 |
return DolphinClient._pdf_to_images_fallback(pdf_path)
|
| 148 |
|
| 149 |
@staticmethod
|
| 150 |
def _pdf_to_images_fallback(pdf_path: str) -> list:
|
| 151 |
-
"""Fallback PDF
|
| 152 |
from PIL import Image
|
| 153 |
import io
|
| 154 |
|
|
@@ -157,61 +124,30 @@ class DolphinClient:
|
|
| 157 |
reader = PdfReader(pdf_path)
|
| 158 |
images = []
|
| 159 |
for page in reader.pages:
|
| 160 |
-
# Extract any embedded images from the page
|
| 161 |
for img_key in page.images:
|
| 162 |
-
|
| 163 |
-
img = Image.open(io.BytesIO(img_data))
|
| 164 |
images.append(img)
|
| 165 |
-
break
|
| 166 |
if not images:
|
| 167 |
-
# Create a blank placeholder if no images could be extracted
|
| 168 |
-
logger.warning("No images extracted from PDF pages, layout analysis may be limited")
|
| 169 |
for _ in reader.pages:
|
| 170 |
-
|
| 171 |
-
images.append(img)
|
| 172 |
return images
|
| 173 |
except Exception as e:
|
| 174 |
logger.error(f"Fallback PDF image conversion failed: {e}")
|
| 175 |
return []
|
| 176 |
|
| 177 |
-
# ------------------------------------------------------------------
|
| 178 |
-
# Core parsing methods
|
| 179 |
-
# ------------------------------------------------------------------
|
| 180 |
-
|
| 181 |
def parse_page(self, image, page_number: int = 0) -> DolphinPageResult:
|
| 182 |
-
"""
|
| 183 |
-
Parse a single page image into structured output.
|
| 184 |
-
|
| 185 |
-
Args:
|
| 186 |
-
image: PIL Image of the page
|
| 187 |
-
page_number: Page index (0-based)
|
| 188 |
-
|
| 189 |
-
Returns:
|
| 190 |
-
DolphinPageResult with markdown and structured elements
|
| 191 |
-
"""
|
| 192 |
self._ensure_loaded()
|
| 193 |
-
|
| 194 |
try:
|
| 195 |
import torch
|
| 196 |
-
|
| 197 |
-
# Prepare input with page-level prompt
|
| 198 |
prompt = "<page_parsing>"
|
| 199 |
-
inputs = self._processor(
|
| 200 |
-
images=image, text=prompt, return_tensors="pt"
|
| 201 |
-
).to(self.device)
|
| 202 |
|
| 203 |
with torch.no_grad():
|
| 204 |
-
outputs = self._model.generate(
|
| 205 |
-
**inputs,
|
| 206 |
-
max_new_tokens=4096,
|
| 207 |
-
do_sample=False,
|
| 208 |
-
)
|
| 209 |
-
|
| 210 |
-
result_text = self._processor.batch_decode(
|
| 211 |
-
outputs, skip_special_tokens=True
|
| 212 |
-
)[0]
|
| 213 |
|
| 214 |
-
|
| 215 |
elements = self._parse_elements_from_text(result_text, page_number)
|
| 216 |
|
| 217 |
return DolphinPageResult(
|
|
@@ -220,86 +156,47 @@ class DolphinClient:
|
|
| 220 |
structured_json={"raw_output": result_text},
|
| 221 |
elements=elements,
|
| 222 |
)
|
| 223 |
-
|
| 224 |
except Exception as e:
|
| 225 |
logger.error(f"Dolphin page parsing failed for page {page_number}: {e}")
|
| 226 |
-
return DolphinPageResult(
|
| 227 |
-
page_number=page_number,
|
| 228 |
-
markdown="",
|
| 229 |
-
elements=[],
|
| 230 |
-
)
|
| 231 |
|
| 232 |
def parse_layout(self, image, page_number: int = 0) -> DolphinLayoutResult:
|
| 233 |
-
"""
|
| 234 |
-
Analyze layout/structure of a page image.
|
| 235 |
-
|
| 236 |
-
Returns section bounding boxes, reading order, and document type hint.
|
| 237 |
-
"""
|
| 238 |
self._ensure_loaded()
|
| 239 |
-
|
| 240 |
try:
|
| 241 |
import torch
|
| 242 |
-
|
| 243 |
prompt = "<layout_parsing>"
|
| 244 |
-
inputs = self._processor(
|
| 245 |
-
images=image, text=prompt, return_tensors="pt"
|
| 246 |
-
).to(self.device)
|
| 247 |
|
| 248 |
with torch.no_grad():
|
| 249 |
-
outputs = self._model.generate(
|
| 250 |
-
**inputs,
|
| 251 |
-
max_new_tokens=2048,
|
| 252 |
-
do_sample=False,
|
| 253 |
-
)
|
| 254 |
-
|
| 255 |
-
result_text = self._processor.batch_decode(
|
| 256 |
-
outputs, skip_special_tokens=True
|
| 257 |
-
)[0]
|
| 258 |
|
|
|
|
| 259 |
sections = self._parse_layout_sections(result_text)
|
| 260 |
-
doc_type_hint = "digital" # Dolphin detects this in stage 1
|
| 261 |
|
| 262 |
return DolphinLayoutResult(
|
| 263 |
page_number=page_number,
|
| 264 |
sections=sections,
|
| 265 |
reading_order=list(range(len(sections))),
|
| 266 |
-
doc_type_hint=
|
| 267 |
)
|
| 268 |
-
|
| 269 |
except Exception as e:
|
| 270 |
logger.error(f"Dolphin layout parsing failed for page {page_number}: {e}")
|
| 271 |
return DolphinLayoutResult(page_number=page_number)
|
| 272 |
|
| 273 |
def parse_document(self, pdf_path: str) -> DolphinDocumentResult:
|
| 274 |
-
"""
|
| 275 |
-
Parse an entire PDF document — page-level + layout for all pages.
|
| 276 |
-
|
| 277 |
-
This is the main entry point for the hybrid parser.
|
| 278 |
-
|
| 279 |
-
Args:
|
| 280 |
-
pdf_path: Path to the PDF file
|
| 281 |
-
|
| 282 |
-
Returns:
|
| 283 |
-
DolphinDocumentResult with all pages parsed
|
| 284 |
-
"""
|
| 285 |
images = self._pdf_to_images(pdf_path)
|
| 286 |
if not images:
|
| 287 |
-
logger.warning(f"No page images extracted from {pdf_path}")
|
| 288 |
return DolphinDocumentResult(total_pages=0)
|
| 289 |
|
| 290 |
-
pages = []
|
| 291 |
-
layouts = []
|
| 292 |
-
all_markdown = []
|
| 293 |
|
| 294 |
for i, image in enumerate(images):
|
| 295 |
logger.debug(f"Parsing page {i + 1}/{len(images)}")
|
| 296 |
-
|
| 297 |
-
# Page-level parsing (structured content)
|
| 298 |
page_result = self.parse_page(image, page_number=i)
|
| 299 |
pages.append(page_result)
|
| 300 |
all_markdown.append(page_result.markdown)
|
| 301 |
-
|
| 302 |
-
# Layout analysis (structure detection)
|
| 303 |
layout_result = self.parse_layout(image, page_number=i)
|
| 304 |
layouts.append(layout_result)
|
| 305 |
|
|
@@ -310,50 +207,27 @@ class DolphinClient:
|
|
| 310 |
total_pages=len(images),
|
| 311 |
)
|
| 312 |
|
| 313 |
-
# ------------------------------------------------------------------
|
| 314 |
-
# Internal helpers
|
| 315 |
-
# ------------------------------------------------------------------
|
| 316 |
-
|
| 317 |
@staticmethod
|
| 318 |
def _parse_elements_from_text(text: str, page_number: int) -> List[DolphinElement]:
|
| 319 |
-
"""Parse Dolphin's text output into structured
|
| 320 |
elements = []
|
| 321 |
if not text:
|
| 322 |
return elements
|
| 323 |
|
| 324 |
import re
|
| 325 |
-
|
| 326 |
-
# Split by Markdown table blocks
|
| 327 |
table_pattern = re.compile(r"(\|.+\|(?:\n\|.+\|)*)", re.MULTILINE)
|
| 328 |
-
|
| 329 |
last_end = 0
|
| 330 |
for match in table_pattern.finditer(text):
|
| 331 |
-
# Text before table
|
| 332 |
pre_text = text[last_end:match.start()].strip()
|
| 333 |
if pre_text:
|
| 334 |
-
elements.append(DolphinElement(
|
| 335 |
-
|
| 336 |
-
content=pre_text,
|
| 337 |
-
page_number=page_number,
|
| 338 |
-
))
|
| 339 |
-
|
| 340 |
-
# Table element
|
| 341 |
-
elements.append(DolphinElement(
|
| 342 |
-
element_type="table",
|
| 343 |
-
content=match.group(0),
|
| 344 |
-
page_number=page_number,
|
| 345 |
-
))
|
| 346 |
last_end = match.end()
|
| 347 |
|
| 348 |
-
# Remaining text after last table
|
| 349 |
remaining = text[last_end:].strip()
|
| 350 |
if remaining:
|
| 351 |
-
elements.append(DolphinElement(
|
| 352 |
-
element_type="text",
|
| 353 |
-
content=remaining,
|
| 354 |
-
page_number=page_number,
|
| 355 |
-
))
|
| 356 |
-
|
| 357 |
return elements
|
| 358 |
|
| 359 |
@staticmethod
|
|
@@ -364,29 +238,17 @@ class DolphinClient:
|
|
| 364 |
return sections
|
| 365 |
|
| 366 |
import re
|
| 367 |
-
|
| 368 |
-
# Dolphin layout output typically contains bounding box coordinates
|
| 369 |
-
# Pattern: <section_type> [x1, y1, x2, y2]
|
| 370 |
-
bbox_pattern = re.compile(
|
| 371 |
-
r"(\w+[\w\s]*?)\s*\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]"
|
| 372 |
-
)
|
| 373 |
|
| 374 |
for match in bbox_pattern.finditer(text):
|
| 375 |
sections.append({
|
| 376 |
"type": match.group(1).strip(),
|
| 377 |
-
"bbox": [
|
| 378 |
-
int(match.group(2)),
|
| 379 |
-
int(match.group(3)),
|
| 380 |
-
int(match.group(4)),
|
| 381 |
-
int(match.group(5)),
|
| 382 |
-
],
|
| 383 |
})
|
| 384 |
|
| 385 |
-
# If no bbox patterns found, treat each line as a section label
|
| 386 |
if not sections:
|
| 387 |
for line in text.strip().split("\n"):
|
| 388 |
line = line.strip()
|
| 389 |
if line:
|
| 390 |
sections.append({"type": line, "bbox": []})
|
| 391 |
-
|
| 392 |
return sections
|
|
|
|
| 1 |
"""
|
| 2 |
+
Dolphin Client — Standalone version for the AI Worker.
|
| 3 |
+
No backend dependencies.
|
|
|
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import os
|
|
|
|
| 18 |
@dataclass
class DolphinElement:
    """One parsed unit of content taken from a document page.

    Only ``element_type`` and ``content`` must be supplied; every other
    field has a default.
    """

    # Kind of element, e.g. "text" or "table".
    element_type: str
    # Raw content of the element.
    content: str
    # Optional bounding box; None when no box is available.
    bbox: Optional[List[float]] = None
    # Defaults to 1.0 when the producer supplies no score.
    confidence: float = 1.0
    # Page the element belongs to; defaults to 0.
    page_number: int = 0
    # Free-form extras; default_factory gives each instance its own dict.
    metadata: Dict[str, Any] = field(default_factory=dict)
|
|
|
|
| 30 |
class DolphinPageResult:
|
| 31 |
"""Result from page-level parsing."""
|
| 32 |
page_number: int
|
| 33 |
+
markdown: str
|
| 34 |
structured_json: Dict[str, Any] = field(default_factory=dict)
|
| 35 |
elements: List[DolphinElement] = field(default_factory=list)
|
| 36 |
|
|
|
|
| 39 |
class DolphinLayoutResult:
|
| 40 |
"""Result from layout analysis."""
|
| 41 |
page_number: int
|
| 42 |
+
sections: List[Dict[str, Any]] = field(default_factory=list)
|
| 43 |
+
reading_order: List[int] = field(default_factory=list)
|
| 44 |
+
doc_type_hint: str = "unknown"
|
| 45 |
|
| 46 |
|
| 47 |
@dataclass
|
|
|
|
| 56 |
class DolphinClient:
|
| 57 |
"""
|
| 58 |
High-level client for Dolphin-v2 document parsing.
|
| 59 |
+
Standalone version — no backend package dependencies.
|
|
|
|
|
|
|
| 60 |
"""
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
def __init__(
    self,
    model_path: Optional[str] = None,
    device: Optional[str] = None,
    max_batch_size: int = 4,
):
    """Record configuration; the model itself is not loaded here.

    Args:
        model_path: Model location. A falsy value (None or "") is replaced
            by ``dolphin.get_model_path()``.
        device: Target device. A falsy value is replaced by
            ``dolphin.get_device()``.
        max_batch_size: Stored as-is on the instance.
    """
    # Imported at call time, so constructing a client is the first point
    # at which the `dolphin` package has to be importable.
    from dolphin import get_model_path, get_device

    # Each resolver is only invoked when its argument was not provided.
    if not model_path:
        model_path = get_model_path()
    self.model_path = model_path

    if not device:
        device = get_device()
    self.device = device

    self.max_batch_size = max_batch_size

    # Both stay None until the lazy loader populates them.
    self._model = None
    self._processor = None

    logger.info(f"DolphinClient initialized: model={self.model_path}, device={self.device}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
def _ensure_loaded(self):
|
| 79 |
"""Lazy-load model and processor on first use."""
|
|
|
|
| 84 |
import torch
|
| 85 |
from transformers import AutoModelForVision2Seq, AutoProcessor
|
| 86 |
|
| 87 |
+
logger.info(f"Loading Dolphin model from {self.model_path}...")
|
| 88 |
|
| 89 |
self._processor = AutoProcessor.from_pretrained(
|
| 90 |
self.model_path, trust_remote_code=True
|
|
|
|
| 97 |
self._model.to(self.device)
|
| 98 |
self._model.eval()
|
| 99 |
|
| 100 |
+
logger.info("Dolphin model loaded successfully")
|
| 101 |
|
| 102 |
except Exception as e:
|
| 103 |
logger.error(f"Failed to load Dolphin model: {e}")
|
| 104 |
raise RuntimeError(f"Dolphin model loading failed: {e}") from e
|
| 105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
@staticmethod
def _pdf_to_images(pdf_path: str) -> list:
    """Render the pages of a PDF as images using pdf2image at 200 DPI.

    If importing pdf2image raises ImportError, a warning is logged and the
    work is delegated to ``_pdf_to_images_fallback``. Any other exception
    raised during conversion is not handled here and propagates.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The list produced by ``convert_from_path`` or by the fallback.
    """
    try:
        from pdf2image import convert_from_path

        page_images = convert_from_path(pdf_path, dpi=200)
    except ImportError:
        logger.warning("pdf2image not installed, using fallback")
        page_images = DolphinClient._pdf_to_images_fallback(pdf_path)
    return page_images
|
| 115 |
|
| 116 |
@staticmethod
|
| 117 |
def _pdf_to_images_fallback(pdf_path: str) -> list:
|
| 118 |
+
"""Fallback PDF to image conversion."""
|
| 119 |
from PIL import Image
|
| 120 |
import io
|
| 121 |
|
|
|
|
| 124 |
reader = PdfReader(pdf_path)
|
| 125 |
images = []
|
| 126 |
for page in reader.pages:
|
|
|
|
| 127 |
for img_key in page.images:
|
| 128 |
+
img = Image.open(io.BytesIO(img_key.data))
|
|
|
|
| 129 |
images.append(img)
|
| 130 |
+
break
|
| 131 |
if not images:
|
|
|
|
|
|
|
| 132 |
for _ in reader.pages:
|
| 133 |
+
images.append(Image.new("RGB", (1700, 2200), "white"))
|
|
|
|
| 134 |
return images
|
| 135 |
except Exception as e:
|
| 136 |
logger.error(f"Fallback PDF image conversion failed: {e}")
|
| 137 |
return []
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
def parse_page(self, image, page_number: int = 0) -> DolphinPageResult:
|
| 140 |
+
"""Parse a single page image into structured output."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
self._ensure_loaded()
|
|
|
|
| 142 |
try:
|
| 143 |
import torch
|
|
|
|
|
|
|
| 144 |
prompt = "<page_parsing>"
|
| 145 |
+
inputs = self._processor(images=image, text=prompt, return_tensors="pt").to(self.device)
|
|
|
|
|
|
|
| 146 |
|
| 147 |
with torch.no_grad():
|
| 148 |
+
outputs = self._model.generate(**inputs, max_new_tokens=4096, do_sample=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
| 150 |
+
result_text = self._processor.batch_decode(outputs, skip_special_tokens=True)[0]
|
| 151 |
elements = self._parse_elements_from_text(result_text, page_number)
|
| 152 |
|
| 153 |
return DolphinPageResult(
|
|
|
|
| 156 |
structured_json={"raw_output": result_text},
|
| 157 |
elements=elements,
|
| 158 |
)
|
|
|
|
| 159 |
except Exception as e:
|
| 160 |
logger.error(f"Dolphin page parsing failed for page {page_number}: {e}")
|
| 161 |
+
return DolphinPageResult(page_number=page_number, markdown="", elements=[])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
def parse_layout(self, image, page_number: int = 0) -> DolphinLayoutResult:
    """Run the layout prompt on one page image and collect its sections.

    The model is loaded first if needed; that call sits outside the
    ``try`` block, so a loading failure propagates to the caller. Any
    exception raised afterwards is logged and an otherwise-default
    ``DolphinLayoutResult`` for the page is returned instead.

    Args:
        image: Page image passed straight to the processor.
        page_number: Page index recorded on the result.

    Returns:
        A ``DolphinLayoutResult``. ``reading_order`` is simply the section
        indices in the order they were parsed, and ``doc_type_hint`` is
        always the fixed value "digital" on success.
    """
    self._ensure_loaded()
    try:
        import torch

        encoded = self._processor(images=image, text="<layout_parsing>", return_tensors="pt")
        encoded = encoded.to(self.device)

        # Inference only: gradients are not needed.
        with torch.no_grad():
            generated = self._model.generate(**encoded, max_new_tokens=2048, do_sample=False)

        decoded = self._processor.batch_decode(generated, skip_special_tokens=True)
        layout_sections = self._parse_layout_sections(decoded[0])
        order = list(range(len(layout_sections)))

        return DolphinLayoutResult(
            page_number=page_number,
            sections=layout_sections,
            reading_order=order,
            doc_type_hint="digital",
        )
    except Exception as e:
        logger.error(f"Dolphin layout parsing failed for page {page_number}: {e}")
        return DolphinLayoutResult(page_number=page_number)
|
| 186 |
|
| 187 |
def parse_document(self, pdf_path: str) -> DolphinDocumentResult:
|
| 188 |
+
"""Parse an entire PDF document."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
images = self._pdf_to_images(pdf_path)
|
| 190 |
if not images:
|
|
|
|
| 191 |
return DolphinDocumentResult(total_pages=0)
|
| 192 |
|
| 193 |
+
pages, layouts, all_markdown = [], [], []
|
|
|
|
|
|
|
| 194 |
|
| 195 |
for i, image in enumerate(images):
|
| 196 |
logger.debug(f"Parsing page {i + 1}/{len(images)}")
|
|
|
|
|
|
|
| 197 |
page_result = self.parse_page(image, page_number=i)
|
| 198 |
pages.append(page_result)
|
| 199 |
all_markdown.append(page_result.markdown)
|
|
|
|
|
|
|
| 200 |
layout_result = self.parse_layout(image, page_number=i)
|
| 201 |
layouts.append(layout_result)
|
| 202 |
|
|
|
|
| 207 |
total_pages=len(images),
|
| 208 |
)
|
| 209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
@staticmethod
def _parse_elements_from_text(text: str, page_number: int) -> List[DolphinElement]:
    """Split model output into ordered "text" and "table" elements.

    A table is a run of consecutive lines that each start and end with a
    pipe character (optional surrounding spaces/tabs and a trailing
    carriage return are tolerated). Everything between tables becomes a
    "text" element; whitespace-only gaps are dropped.

    Args:
        text: Raw text to split. An empty or None value yields no elements.
        page_number: Page index stamped on every element produced.

    Returns:
        Elements in document order.
    """
    elements: List[DolphinElement] = []
    if not text:
        return elements

    import re

    # A table row must occupy a whole line. The previous pattern carried
    # re.MULTILINE but no ^/$ anchors, so the flag had no effect: any
    # "| ... |" run inside a prose line was treated as a table, and rows
    # with indentation, trailing blanks or CRLF endings broke a table apart.
    row = r"[ \t]*\|.+\|[ \t]*\r?"
    table_pattern = re.compile(rf"^{row}(?:\n{row})*$", re.MULTILINE)

    last_end = 0
    for match in table_pattern.finditer(text):
        pre_text = text[last_end:match.start()].strip()
        if pre_text:
            elements.append(
                DolphinElement(element_type="text", content=pre_text, page_number=page_number)
            )
        # strip() removes the tolerated outer indentation / trailing "\r" so
        # the table content still starts and ends with a pipe, as before.
        elements.append(
            DolphinElement(
                element_type="table",
                content=match.group(0).strip(),
                page_number=page_number,
            )
        )
        last_end = match.end()

    # Whatever follows the last table (or the whole input when none matched).
    remaining = text[last_end:].strip()
    if remaining:
        elements.append(
            DolphinElement(element_type="text", content=remaining, page_number=page_number)
        )
    return elements
|
| 232 |
|
| 233 |
@staticmethod
|
|
|
|
| 238 |
return sections
|
| 239 |
|
| 240 |
import re
|
| 241 |
+
bbox_pattern = re.compile(r"(\w+[\w\s]*?)\s*\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
for match in bbox_pattern.finditer(text):
|
| 244 |
sections.append({
|
| 245 |
"type": match.group(1).strip(),
|
| 246 |
+
"bbox": [int(match.group(2)), int(match.group(3)), int(match.group(4)), int(match.group(5))],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
})
|
| 248 |
|
|
|
|
| 249 |
if not sections:
|
| 250 |
for line in text.strip().split("\n"):
|
| 251 |
line = line.strip()
|
| 252 |
if line:
|
| 253 |
sections.append({"type": line, "bbox": []})
|
|
|
|
| 254 |
return sections
|