Happy People committed on
Commit
8b33e12
·
1 Parent(s): 7470785

Standalone worker: zero backend dependencies

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete set.
Files changed (50) hide show
  1. Dockerfile +13 -28
  2. backend/app/__init__.py +0 -98
  3. backend/app/api/admin.py +0 -433
  4. backend/app/api/auth.py +0 -221
  5. backend/app/api/endpoints.py +0 -742
  6. backend/app/core/__init__.py +0 -52
  7. backend/app/core/config.py +0 -59
  8. backend/app/core/database.py +0 -26
  9. backend/app/core/feature_registry.py +0 -255
  10. backend/app/core/migrations.py +0 -111
  11. backend/app/core/plan_config.py +0 -192
  12. backend/app/core/security.py +0 -28
  13. backend/app/core/stripe_config.py +0 -29
  14. backend/app/main.py +0 -124
  15. backend/app/models/feature_flags.py +0 -59
  16. backend/app/models/user.py +0 -63
  17. backend/app/schemas/chat.py +0 -14
  18. backend/app/schemas/financial.py +0 -47
  19. backend/app/schemas/user.py +0 -82
  20. backend/app/services/__init__.py +0 -37
  21. backend/app/services/analysis/__init__.py +0 -54
  22. backend/app/services/analysis/engine_lite.py +0 -48
  23. backend/app/services/analysis/factory.py +0 -18
  24. backend/app/services/analysis/fundamental.py +0 -75
  25. backend/app/services/analysis/growth.py +0 -26
  26. backend/app/services/analysis/health_score.py +0 -46
  27. backend/app/services/analysis/kpi.py +0 -56
  28. backend/app/services/analysis/risk.py +0 -57
  29. backend/app/services/analysis/simulation.py +0 -67
  30. backend/app/services/feature_service.py +0 -306
  31. backend/app/services/ingestion/__init__.py +0 -57
  32. backend/app/services/ingestion/dolphin/__init__.py +0 -158
  33. backend/app/services/ingestion/dolphin/classifier.py +0 -288
  34. backend/app/services/ingestion/dolphin/extractor.py +0 -336
  35. backend/app/services/ingestion/dolphin/remote_client.py +0 -110
  36. backend/app/services/ingestion/mappings.py +0 -315
  37. backend/app/services/ingestion/parser_csv.py +0 -127
  38. backend/app/services/ingestion/parser_dolphin.py +0 -429
  39. backend/app/services/ingestion/parser_pdf.py +0 -402
  40. backend/app/services/ingestion/parser_xlsx.py +0 -312
  41. backend/app/services/ingestion/unified_parser.py +0 -84
  42. backend/app/services/intelligence/ai_cfo.py +0 -52
  43. backend/app/services/intelligence/gemini_service.py +0 -238
  44. backend/app/services/intelligence/geo_service.py +0 -104
  45. backend/app/services/intelligence/rag.py +0 -35
  46. backend/app/services/reporting/pdf_report.py +0 -565
  47. backend/app/services/reporting/pptx_report.py +0 -57
  48. backend/requirements.txt +0 -29
  49. dolphin/__init__.py +37 -0
  50. {backend/app/services/ingestion/dolphin → dolphin}/client.py +41 -179
Dockerfile CHANGED
@@ -1,38 +1,23 @@
1
- FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
2
 
3
- # Set environment
4
- ENV DEBIAN_FRONTEND=noninteractive
5
- ENV PYTHONUNBUFFERED=1
6
- ENV PATH="/home/user/.local/bin:$PATH"
7
-
8
- # Install system dependencies
9
- RUN apt-get update && apt-get install -y \
10
- python3.10 \
11
- python3-pip \
12
- python3-venv \
13
- poppler-utils \
14
- git \
15
  && rm -rf /var/lib/apt/lists/*
16
 
17
- # Create user (Hugging Face Spaces runs as user 1000)
18
  RUN useradd -m -u 1000 user
19
  USER user
20
- WORKDIR /home/user/app
21
 
22
- # Copy application code
23
- # We expect the `visique/backend` code to be copied into `backend/`
24
- # and `visique/ai-worker` code to be in `.`
25
- COPY --chown=user:user . .
26
 
27
- # Install Dependencies
28
- RUN pip3 install --no-cache-dir --upgrade pip && \
29
- pip3 install --no-cache-dir -r requirements.txt
30
 
31
- # Create models directory
32
- RUN mkdir -p /home/user/app/models/dolphin-v2
33
 
34
- # Expose port (HF Spaces defaults to 7860)
35
  EXPOSE 7860
36
-
37
- # CMD to copy backend lib and start app
38
- CMD ["/bin/bash", "-c", "cp -r backend/app . && uvicorn main:app --host 0.0.0.0 --port 7860"]
 
1
+ FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
2
 
3
+ # System deps
4
+ RUN apt-get update && apt-get install -y --no-install-recommends \
5
+ python3 python3-pip poppler-utils git \
 
 
 
 
 
 
 
 
 
6
  && rm -rf /var/lib/apt/lists/*
7
 
8
+ # Create non-root user (required by HF Spaces)
9
  RUN useradd -m -u 1000 user
10
  USER user
11
+ ENV HOME=/home/user PATH="/home/user/.local/bin:$PATH"
12
 
13
+ WORKDIR /home/user/app
 
 
 
14
 
15
+ # Install Python deps first (layer caching)
16
+ COPY --chown=user requirements.txt .
17
+ RUN pip install --no-cache-dir --user -r requirements.txt
18
 
19
+ # Copy application code
20
+ COPY --chown=user . .
21
 
 
22
  EXPOSE 7860
23
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
backend/app/__init__.py DELETED
@@ -1,98 +0,0 @@
1
- """
2
- Visique Backend Application
3
-
4
- This package contains the backend API and services for the Visique financial analysis platform.
5
-
6
- ## Architecture Overview
7
-
8
- ```
9
- app/
10
- ├── api/ # FastAPI route handlers
11
- │ ├── admin.py # Admin console endpoints (users, reports, features)
12
- │ ├── auth.py # Authentication (login, register, JWT)
13
- │ └── endpoints.py # Analysis endpoints (upload, simulate, report)
14
-
15
- ├── core/ # Core configuration and utilities
16
- │ ├── config.py # Environment settings (API keys, URLs)
17
- │ ├── database.py # SQLAlchemy database connection
18
- │ ├── security.py # JWT token creation/validation
19
- │ ├── feature_registry.py # Auto-discoverable feature definitions
20
- │ └── plan_config.py # Plan limits and default features
21
-
22
- ├── models/ # SQLAlchemy database models
23
- │ ├── user.py # User, Analysis, Payment models
24
- │ └── feature_flags.py # PlanFeatureOverride, PlanUploadLimit
25
-
26
- ├── schemas/ # Pydantic request/response schemas
27
- │ ├── user.py # UserCreate, UserResponse, etc.
28
- │ ├── financial.py # StandardizedDataPackage, KPIs, etc.
29
- │ └── chat.py # ChatRequest, ChatResponse
30
-
31
- ├── services/ # Business logic layer
32
- │ ├── feature_service.py # Feature flag resolution logic
33
- │ ├── analysis/ # Financial analysis modules
34
- │ │ ├── fundamental.py # Main analysis orchestrator
35
- │ │ ├── kpi.py # KPI calculations
36
- │ │ ├── risk.py # Risk analysis
37
- │ │ ├── health_score.py # Health score computation
38
- │ │ ├── growth.py # Growth metrics
39
- │ │ └── simulation.py # What-if scenario modeling
40
- │ ├── ingestion/ # Data parsing
41
- │ │ ├── parser_csv.py # CSV file parsing
42
- │ │ ├── parser_pdf.py # PDF extraction + OCR
43
- │ │ └── mappings.py # Field name normalization
44
- │ ├── intelligence/ # AI-powered features
45
- │ │ ├── gemini_service.py # Gemini API integration
46
- │ │ ├── ai_cfo.py # AI CFO chat functionality
47
- │ │ ├── geo_service.py # Geo-strategic analysis
48
- │ │ └── rag.py # RAG for document QA
49
- │ └── reporting/ # Report generation
50
- │ ├── pdf_report.py # PDF report builder
51
- │ └── pptx_report.py # PowerPoint builder
52
-
53
- └── main.py # FastAPI app initialization
54
- ```
55
-
56
- ## Module Responsibilities
57
-
58
- ### API Layer (`api/`)
59
- - HTTP request handling only
60
- - Input validation via Pydantic
61
- - Delegates all logic to services
62
- - Returns standardized responses
63
-
64
- ### Core Layer (`core/`)
65
- - Application-wide configuration
66
- - Feature registry (add new features here)
67
- - Plan configuration (modify limits here)
68
- - Security utilities (JWT)
69
-
70
- ### Models Layer (`models/`)
71
- - Database schema definitions
72
- - Relationships between entities
73
- - No business logic
74
-
75
- ### Schemas Layer (`schemas/`)
76
- - Request/response validation
77
- - Data transformation for API
78
- - Type hints for IDE support
79
-
80
- ### Services Layer (`services/`)
81
- - All business logic lives here
82
- - Each subdirectory is a domain
83
- - Services are stateless and testable
84
-
85
- ## Adding New Features
86
-
87
- 1. **New Feature Flag**: Add to `core/feature_registry.py`
88
- 2. **New API Endpoint**: Add to appropriate `api/*.py`
89
- 3. **New Service Logic**: Create in `services/` subdirectory
90
- 4. **New Model Field**: Add to `models/` and run migration
91
-
92
- ## Key Design Patterns
93
-
94
- - **Repository Pattern**: Services interact with DB via session
95
- - **Dependency Injection**: FastAPI `Depends()` for DB/auth
96
- - **Single Responsibility**: Each module has one clear purpose
97
- - **Feature Registry**: Auto-discoverable, category-organized
98
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/api/admin.py DELETED
@@ -1,433 +0,0 @@
1
- from fastapi import APIRouter, Depends, HTTPException, status
2
- from sqlalchemy.orm import Session
3
- from typing import List, Optional
4
- from app.core.database import get_db
5
- from app.models.user import User, Payment, Analysis
6
- from app.schemas.user import UserResponse, PaymentResponse
7
- from app.api.auth import get_current_user
8
- import os
9
-
10
- router = APIRouter(prefix="/admin", tags=["admin"])
11
-
12
- def get_current_admin(current_user: User = Depends(get_current_user)):
13
- if not current_user.is_admin:
14
- raise HTTPException(
15
- status_code=status.HTTP_403_FORBIDDEN,
16
- detail="The user doesn't have enough privileges",
17
- )
18
- return current_user
19
-
20
- @router.get("/payments", response_model=List[PaymentResponse])
21
- def read_all_payments(
22
- skip: int = 0,
23
- limit: int = 100,
24
- db: Session = Depends(get_db),
25
- current_user: User = Depends(get_current_admin)
26
- ):
27
- payments = db.query(Payment).offset(skip).limit(limit).all()
28
- return payments
29
-
30
- @router.delete("/users/{user_id}", status_code=status.HTTP_204_NO_CONTENT)
31
- def delete_user(
32
- user_id: int,
33
- db: Session = Depends(get_db),
34
- current_user: User = Depends(get_current_admin)
35
- ):
36
- user = db.query(User).filter(User.id == user_id).first()
37
- if not user:
38
- raise HTTPException(status_code=404, detail="User not found")
39
-
40
- if user.id == current_user.id:
41
- raise HTTPException(status_code=400, detail="Cannot delete your own admin account")
42
-
43
- db.delete(user)
44
- db.commit()
45
- return None
46
-
47
- from pydantic import BaseModel
48
- class AdminUserUpdate(BaseModel):
49
- full_name: Optional[str] = None
50
- company_name: Optional[str] = None
51
- plan: Optional[str] = None
52
- is_admin: Optional[bool] = None
53
- is_super_admin: Optional[bool] = None
54
- visique_id: Optional[str] = None
55
- ein: Optional[str] = None
56
- address: Optional[str] = None
57
- industry: Optional[str] = None
58
-
59
- class FeatureToggleRequest(BaseModel):
60
- feature_states: dict # {feature_id: bool}
61
-
62
- @router.put("/users/{user_id}", response_model=UserResponse)
63
- def update_user_admin(
64
- user_id: int,
65
- user_update: AdminUserUpdate,
66
- db: Session = Depends(get_db),
67
- current_user: User = Depends(get_current_admin)
68
- ):
69
- user = db.query(User).filter(User.id == user_id).first()
70
- if not user:
71
- raise HTTPException(status_code=404, detail="User not found")
72
-
73
- # Check if target is admin and requester is not super admin
74
- if user.is_admin and not current_user.is_super_admin:
75
- raise HTTPException(
76
- status_code=403,
77
- detail="Only Special Admins can edit Admin profiles"
78
- )
79
-
80
- update_data = user_update.dict(exclude_unset=True)
81
- for key, value in update_data.items():
82
- # Only super admins can change is_super_admin status
83
- if key == "is_super_admin" and not current_user.is_super_admin:
84
- continue
85
- setattr(user, key, value)
86
-
87
- db.commit()
88
- db.refresh(user)
89
- return user
90
-
91
-
92
- @router.put("/users/{user_id}/features")
93
- def update_user_features(
94
- user_id: int,
95
- request: FeatureToggleRequest,
96
- db: Session = Depends(get_db),
97
- current_user: User = Depends(get_current_admin)
98
- ):
99
- """
100
- Update custom feature overrides for a specific user.
101
- """
102
- user = db.query(User).filter(User.id == user_id).first()
103
- if not user:
104
- raise HTTPException(status_code=404, detail="User not found")
105
-
106
- # Get current and merge
107
- current_features = user.custom_features or {}
108
-
109
- # Handle SQLite parsing if needed
110
- if isinstance(current_features, str):
111
- import json
112
- try:
113
- current_features = json.loads(current_features)
114
- except:
115
- current_features = {}
116
-
117
- # Ensure it's a dict copy to trigger mutation detection
118
- new_features = dict(current_features)
119
-
120
- for k, v in request.feature_states.items():
121
- new_features[k] = v
122
-
123
- user.custom_features = new_features
124
-
125
- from sqlalchemy.orm.attributes import flag_modified
126
- flag_modified(user, "custom_features")
127
-
128
- db.commit()
129
- return {
130
- "status": "success",
131
- "user_id": user.id,
132
- "custom_features": user.custom_features
133
- }
134
-
135
-
136
- class EngineUpdateRequest(BaseModel):
137
- engine: str
138
-
139
- @router.put("/users/{user_id}/engine")
140
- def update_user_engine(
141
- user_id: int,
142
- request: EngineUpdateRequest,
143
- db: Session = Depends(get_db),
144
- current_user: User = Depends(get_current_admin)
145
- ):
146
- """
147
- Update a user's preferred engine (v1 or v2).
148
- """
149
- user = db.query(User).filter(User.id == user_id).first()
150
- if not user:
151
- raise HTTPException(status_code=404, detail="User not found")
152
-
153
- if request.engine not in ["v1", "v2"]:
154
- raise HTTPException(status_code=400, detail="Invalid engine. Must be 'v1' or 'v2'")
155
-
156
- user.preferred_engine = request.engine
157
- db.commit()
158
- db.refresh(user)
159
-
160
- return {"status": "success", "user_id": user.id, "preferred_engine": user.preferred_engine}
161
-
162
- @router.get("/users", response_model=List[UserResponse])
163
- def read_all_users(
164
- skip: int = 0,
165
- limit: int = 100,
166
- search: Optional[str] = None,
167
- db: Session = Depends(get_db),
168
- current_user: User = Depends(get_current_admin)
169
- ):
170
- query = db.query(User)
171
- if search:
172
- # Search by Visique ID (exact or partial) or Email or Name
173
- search_filter = f"%{search}%"
174
- query = query.filter(
175
- (User.email.ilike(search_filter)) |
176
- (User.full_name.ilike(search_filter)) |
177
- (User.visique_id.ilike(search_filter))
178
- )
179
- return query.offset(skip).limit(limit).all()
180
-
181
- @router.get("/analyses")
182
- def read_all_analyses(
183
- skip: int = 0,
184
- limit: int = 100,
185
- db: Session = Depends(get_db),
186
- current_user: User = Depends(get_current_admin)
187
- ):
188
- """
189
- Get all analyses from all users.
190
- Returns a simplified list for the admin dashboard.
191
- """
192
- # Join with User to get owner details
193
- analyses = db.query(Analysis).join(User).order_by(Analysis.timestamp.desc()).offset(skip).limit(limit).all()
194
-
195
- result = []
196
- for a in analyses:
197
- result.append({
198
- "id": a.id,
199
- "company_name": a.company_name,
200
- "filename": a.input_filename,
201
- "timestamp": a.timestamp,
202
- "owner_email": a.owner.email,
203
- "owner_visique_id": a.owner.visique_id
204
- })
205
- return result
206
-
207
- @router.delete("/analyses/{analysis_id}", status_code=status.HTTP_204_NO_CONTENT)
208
- def delete_analysis_admin(
209
- analysis_id: int,
210
- db: Session = Depends(get_db),
211
- current_user: User = Depends(get_current_admin)
212
- ):
213
- analysis = db.query(Analysis).filter(Analysis.id == analysis_id).first()
214
- if not analysis:
215
- raise HTTPException(status_code=404, detail="Analysis not found")
216
-
217
- # Delete file from disk
218
- if analysis.stored_filename and os.path.exists(analysis.stored_filename):
219
- try:
220
- os.remove(analysis.stored_filename)
221
- except OSError:
222
- pass # Continue even if file delete fails
223
-
224
- db.delete(analysis)
225
- db.commit()
226
- return None
227
-
228
-
229
- # =============================================================================
230
- # USAGE TRACKING ENDPOINTS
231
- # =============================================================================
232
-
233
- @router.get("/usage")
234
- def get_usage_stats(
235
- db: Session = Depends(get_db),
236
- current_user: User = Depends(get_current_admin)
237
- ):
238
- """
239
- Get upload usage statistics for all users.
240
- Shows uploads used, limit, and percentage for admin dashboard.
241
- """
242
- from app.services.feature_service import get_effective_upload_limit
243
-
244
- users = db.query(User).all()
245
- result = []
246
-
247
- for user in users:
248
- plan = user.plan or "Individual"
249
- if user.is_admin:
250
- plan = "Admin"
251
-
252
- limit = get_effective_upload_limit(db, plan)
253
- used = user.monthly_upload_count or 0
254
- percentage = round((used / limit * 100), 1) if limit > 0 else 0
255
-
256
- result.append({
257
- "id": user.id,
258
- "email": user.email,
259
- "full_name": user.full_name,
260
- "visique_id": user.visique_id,
261
- "plan": plan,
262
- "uploads_used": used,
263
- "uploads_limit": limit,
264
- "usage_percentage": percentage,
265
- "reset_date": user.upload_reset_date.isoformat() if user.upload_reset_date else None
266
- })
267
-
268
- # Sort by usage percentage descending
269
- result.sort(key=lambda x: x["usage_percentage"], reverse=True)
270
- return result
271
-
272
-
273
- # =============================================================================
274
- # FEATURE FLAG ENDPOINTS
275
- # =============================================================================
276
-
277
- @router.get("/features")
278
- def get_feature_matrix(
279
- db: Session = Depends(get_db),
280
- current_user: User = Depends(get_current_admin)
281
- ):
282
- """
283
- Get the full feature matrix for admin console.
284
- Shows all features grouped by category with per-plan toggles.
285
- """
286
- from app.services.feature_service import get_feature_matrix as get_matrix
287
- return get_matrix(db)
288
-
289
-
290
- @router.get("/features/registry")
291
- def get_feature_registry(
292
- current_user: User = Depends(get_current_admin)
293
- ):
294
- """
295
- Get the feature registry - all available features.
296
- Useful for understanding what features can be controlled.
297
- """
298
- from app.core.feature_registry import get_features_by_category, get_all_feature_ids
299
-
300
- categories = get_features_by_category()
301
- result = {}
302
-
303
- for cat_name, features in categories.items():
304
- result[cat_name] = [
305
- {
306
- "id": f.id,
307
- "name": f.name,
308
- "description": f.description,
309
- "default_enabled": f.default_enabled
310
- }
311
- for f in features
312
- ]
313
-
314
- return {
315
- "total_features": len(get_all_feature_ids()),
316
- "categories": result
317
- }
318
-
319
-
320
- @router.get("/features/{plan_name}")
321
- def get_plan_features(
322
- plan_name: str,
323
- db: Session = Depends(get_db),
324
- current_user: User = Depends(get_current_admin)
325
- ):
326
- """
327
- Get enabled features for a specific plan.
328
- """
329
- from app.services.feature_service import get_effective_features, get_effective_upload_limit
330
- from app.core.plan_config import get_all_plans, get_all_engines
331
-
332
- if plan_name not in get_all_plans() and plan_name not in get_all_engines():
333
- raise HTTPException(status_code=404, detail=f"Unknown plan: {plan_name}")
334
-
335
- return {
336
- "plan": plan_name,
337
- "upload_limit": get_effective_upload_limit(db, plan_name),
338
- "enabled_features": get_effective_features(db, plan_name)
339
- }
340
-
341
-
342
- @router.put("/features/{plan_name}")
343
- def update_plan_features(
344
- plan_name: str,
345
- request: FeatureToggleRequest,
346
- db: Session = Depends(get_db),
347
- current_user: User = Depends(get_current_admin)
348
- ):
349
- """
350
- Bulk update features for a plan.
351
- """
352
- from app.services.feature_service import bulk_set_features
353
- from app.core.plan_config import get_all_plans, get_all_engines
354
-
355
- if plan_name not in get_all_plans() and plan_name not in get_all_engines():
356
- raise HTTPException(status_code=404, detail=f"Unknown plan: {plan_name}")
357
-
358
- count = bulk_set_features(db, plan_name, request.feature_states, current_user.id)
359
-
360
- return {
361
- "message": f"Updated {count} features for {plan_name}",
362
- "plan": plan_name,
363
- "updated_count": count
364
- }
365
-
366
-
367
- @router.post("/features/{plan_name}/reset")
368
- def reset_plan_features(
369
- plan_name: str,
370
- db: Session = Depends(get_db),
371
- current_user: User = Depends(get_current_admin)
372
- ):
373
- """
374
- Reset a plan's features to defaults (removes all overrides).
375
- """
376
- from app.services.feature_service import reset_plan_to_defaults
377
- from app.core.plan_config import get_all_plans, get_all_engines
378
-
379
- if plan_name not in get_all_plans() and plan_name not in get_all_engines():
380
- raise HTTPException(status_code=404, detail=f"Unknown plan: {plan_name}")
381
-
382
- count = reset_plan_to_defaults(db, plan_name)
383
-
384
- return {
385
- "message": f"Reset {plan_name} to defaults, removed {count} overrides",
386
- "plan": plan_name,
387
- "removed_overrides": count
388
- }
389
-
390
-
391
- class UploadLimitRequest(BaseModel):
392
- upload_limit: int
393
-
394
-
395
- @router.put("/features/{plan_name}/limit")
396
- def update_plan_upload_limit(
397
- plan_name: str,
398
- request: UploadLimitRequest,
399
- db: Session = Depends(get_db),
400
- current_user: User = Depends(get_current_admin)
401
- ):
402
- """
403
- Update upload limit for a plan.
404
- """
405
- from app.models.feature_flags import PlanUploadLimit
406
- from app.core.plan_config import get_all_plans
407
-
408
- if plan_name not in get_all_plans():
409
- raise HTTPException(status_code=404, detail=f"Unknown plan: {plan_name}")
410
-
411
- # Find or create limit override
412
- override = db.query(PlanUploadLimit).filter(
413
- PlanUploadLimit.plan_name == plan_name
414
- ).first()
415
-
416
- if override:
417
- override.upload_limit = request.upload_limit
418
- override.updated_by_id = current_user.id
419
- else:
420
- override = PlanUploadLimit(
421
- plan_name=plan_name,
422
- upload_limit=request.upload_limit,
423
- updated_by_id=current_user.id
424
- )
425
- db.add(override)
426
-
427
- db.commit()
428
-
429
- return {
430
- "message": f"Updated upload limit for {plan_name}",
431
- "plan": plan_name,
432
- "new_limit": request.upload_limit
433
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/api/auth.py DELETED
@@ -1,221 +0,0 @@
1
- from datetime import datetime, timedelta
2
- from typing import Optional
3
- from fastapi import APIRouter, Depends, HTTPException, status
4
- from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
5
- from jose import JWTError, jwt
6
- from passlib.context import CryptContext
7
- from sqlalchemy.orm import Session
8
- from app.core.database import get_db
9
- from app.models.user import User
10
- from app.schemas.user import UserCreate, UserResponse, Token, UpgradeRequest
11
- from app.core.security import SECRET_KEY, ALGORITHM, ACCESS_TOKEN_EXPIRE_MINUTES
12
-
13
- from app.core.security import verify_password, get_password_hash, create_access_token, ALGORITHM, SECRET_KEY, ACCESS_TOKEN_EXPIRE_MINUTES
14
-
15
- router = APIRouter(prefix="/auth", tags=["auth"])
16
-
17
- @router.get("/probe")
18
- def probe():
19
- return {"status": "auth_router_working"}
20
-
21
- oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/v1/auth/login")
22
-
23
- @router.post("/register", response_model=UserResponse)
24
- def register(user: UserCreate, db: Session = Depends(get_db)):
25
- try:
26
- db_user = db.query(User).filter(User.email == user.email).first()
27
- if db_user:
28
- raise HTTPException(status_code=400, detail="Email already registered")
29
- hashed_password = get_password_hash(user.password)
30
-
31
- # Valid Admin Keys
32
- VALID_ADMIN_KEYS = [
33
- "VSQADM001", "VSQADM002", "VSQADM003",
34
- "VSQADM004", "VSQADM005", "VSQADM006"
35
- ]
36
-
37
- # Check Admin Key
38
- is_admin = False
39
- is_super_admin = False
40
- SUPER_ADMIN_KEYS = ["VSQADM003", "VSQADM006"]
41
-
42
- if user.admin_key and user.admin_key in VALID_ADMIN_KEYS:
43
- is_admin = True
44
- if user.admin_key in SUPER_ADMIN_KEYS:
45
- is_super_admin = True
46
-
47
- # Generate Visique ID
48
- # Generate Visique ID
49
- import uuid
50
- import random
51
- if is_admin:
52
- # VISI-###### (6 digits)
53
- digits = ''.join([str(random.randint(0, 9)) for _ in range(6)])
54
- visique_id = f"VISI-{digits}"
55
- else:
56
- visique_id = f"VSQ-{str(uuid.uuid4())[:8].upper()}"
57
-
58
- new_user = User(
59
- email=user.email,
60
- hashed_password=hashed_password,
61
- full_name=user.full_name,
62
- company_name=user.company_name,
63
- is_admin=is_admin,
64
- is_super_admin=is_super_admin,
65
- visique_id=visique_id
66
- )
67
- db.add(new_user)
68
- db.commit()
69
- db.refresh(new_user)
70
- return new_user
71
- except HTTPException as he:
72
- raise he
73
- except Exception as e:
74
- print(f"Registration Error: {str(e)}")
75
- raise HTTPException(status_code=500, detail=f"Registration failed: {str(e)}")
76
-
77
- @router.post("/login", response_model=Token)
78
- def login(form_data: OAuth2PasswordRequestForm = Depends(), db: Session = Depends(get_db)):
79
- user = db.query(User).filter(User.email == form_data.username).first()
80
- if not user or not verify_password(form_data.password, user.hashed_password):
81
- raise HTTPException(
82
- status_code=status.HTTP_401_UNAUTHORIZED,
83
- detail="Incorrect username or password",
84
- headers={"WWW-Authenticate": "Bearer"},
85
- )
86
- access_token_expires = timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
87
- access_token = create_access_token(
88
- data={"sub": user.email}, expires_delta=access_token_expires
89
- )
90
- return {"access_token": access_token, "token_type": "bearer"}
91
-
92
- async def get_current_user(
93
- token: Optional[str] = Depends(oauth2_scheme),
94
- db: Session = Depends(get_db),
95
- query_token: Optional[str] = None
96
- ):
97
- actual_token = token or query_token
98
-
99
- credentials_exception = HTTPException(
100
- status_code=status.HTTP_401_UNAUTHORIZED,
101
- detail="Could not validate credentials",
102
- headers={"WWW-Authenticate": "Bearer"},
103
- )
104
- if not actual_token:
105
- raise credentials_exception
106
-
107
- try:
108
- payload = jwt.decode(actual_token, SECRET_KEY, algorithms=[ALGORITHM])
109
- email: str = payload.get("sub")
110
- if email is None:
111
- raise credentials_exception
112
- except JWTError:
113
- raise credentials_exception
114
- user = db.query(User).filter(User.email == email).first()
115
- if user is None:
116
- raise credentials_exception
117
- return user
118
-
119
- @router.get("/me", response_model=UserResponse)
120
- async def read_users_me(current_user: User = Depends(get_current_user)):
121
- return current_user
122
-
123
- from app.core.config import settings
124
- from app.core.stripe_config import create_checkout_session
125
- import stripe
126
- from fastapi import Request
127
-
128
- @router.post("/create-checkout-session")
129
- def create_payment(
130
- plan_id: str, # Pass the Stripe Price ID
131
- current_user: User = Depends(get_current_user),
132
- db: Session = Depends(get_db)
133
- ):
134
- session = create_checkout_session(current_user, plan_id)
135
- if not session:
136
- raise HTTPException(status_code=400, detail="Error creating payment session")
137
- return {"url": session.url}
138
-
139
- @router.post("/webhook")
140
- async def stripe_webhook(request: Request, db: Session = Depends(get_db)):
141
- payload = await request.body()
142
- sig_header = request.headers.get("stripe-signature")
143
-
144
- try:
145
- event = stripe.Webhook.construct_event(
146
- payload, sig_header, settings.STRIPE_WEBHOOK_SECRET
147
- )
148
- except ValueError as e:
149
- raise HTTPException(status_code=400, detail="Invalid payload")
150
- except stripe.error.SignatureVerificationError as e:
151
- raise HTTPException(status_code=400, detail="Invalid signature")
152
-
153
- if event["type"] == "checkout.session.completed":
154
- session = event["data"]["object"]
155
-
156
- # Retrieve user and update plan
157
- # Note: metadata values are strings
158
- user_id = session.get("client_reference_id")
159
- if user_id:
160
- user = db.query(User).filter(User.id == int(user_id)).first()
161
- if user:
162
- user.plan = "Business" # Or derive from session
163
- user.plan_expires_at = datetime.utcnow() + timedelta(days=30)
164
-
165
- # Record Payment
166
- from app.models.user import Payment
167
- new_payment = Payment(
168
- user_id=user.id,
169
- amount=session.get("amount_total", 0) / 100.0,
170
- status="paid",
171
- plan_name="Business",
172
- date=datetime.utcnow()
173
- )
174
- db.add(new_payment)
175
- db.commit()
176
-
177
- return {"status": "success"}
178
-
179
- from typing import List
180
- from app.schemas.user import PaymentResponse
181
- from app.models.user import Payment
182
-
183
- @router.get("/payments/me", response_model=List[PaymentResponse])
184
- def read_my_payments(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
185
- return db.query(Payment).filter(Payment.user_id == current_user.id).all()
186
-
187
- from fastapi import UploadFile, File
188
- import shutil
189
- import os
190
-
191
- @router.post("/me/avatar")
192
- async def upload_avatar(
193
- file: UploadFile = File(...),
194
- current_user: User = Depends(get_current_user),
195
- db: Session = Depends(get_db)
196
- ):
197
- # Determine file extension
198
- ext = file.filename.split(".")[-1]
199
- if ext.lower() not in ["jpg", "jpeg", "png", "webp"]:
200
- raise HTTPException(status_code=400, detail="Invalid image format. Use JPG, PNG, or WebP.")
201
-
202
- # Save file
203
- filename = f"avatar_{current_user.id}_{file.filename}"
204
- upload_dir = "uploads/avatars"
205
- if not os.path.exists(upload_dir):
206
- os.makedirs(upload_dir)
207
-
208
- file_path = os.path.join(upload_dir, filename)
209
- with open(file_path, "wb+") as buffer:
210
- shutil.copyfileobj(file.file, buffer)
211
-
212
- # Update User Profile
213
- # Store relative path or full? Relative to allow frontend to fetch via static mount
214
- # Assuming we mount /uploads as /static/uploads or similar
215
- # For now, store relative path "uploads/avatars/..."
216
- current_user.profile_picture_url = f"/api/v1/static/avatars/{filename}"
217
-
218
- db.commit()
219
- db.refresh(current_user)
220
-
221
- return {"message": "Avatar updated", "url": current_user.profile_picture_url}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/api/endpoints.py DELETED
@@ -1,742 +0,0 @@
1
- from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
2
- from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
3
- from app.schemas.financial import StandardizedDataPackage
4
- from app.core.security import create_access_token
5
- from typing import Annotated
6
- from pydantic import BaseModel
7
- from datetime import date
8
- import os
9
- from app.services.ingestion.parser_csv import CSVParser
10
- from app.services.ingestion.parser_pdf import PDFParser
11
- from app.services.analysis.kpi import KPIAnalyzer
12
- from app.services.analysis.risk import RiskAnalyzer
13
- from app.services.analysis.health_score import HealthScoreAnalyzer
14
- from app.services.analysis.fundamental import FundamentalAnalyzer
15
- from app.services.analysis.factory import AnalysisFactory
16
- from app.services.analysis.growth import GrowthAnalyzer
17
- from app.services.analysis.simulation import SimulationService
18
- from app.services.reporting.pdf_report import PDFReporter
19
- from app.services.reporting.pptx_report import PPTXReporter
20
- from app.schemas.financial import StandardizedDataPackage, FinancialReport, IncomeStatementStandard, BalanceSheetStandard, CashFlowStandard, KPIMetrics, RiskAnalysis, HealthScoreBreakdown
21
- from app.schemas.chat import ChatRequest, ChatResponse
22
- from app.api.auth import get_current_user
23
- from app.models.user import User, Analysis
24
- from app.core.database import get_db
25
- from sqlalchemy.orm import Session
26
- import json
27
- from fastapi.responses import FileResponse
28
- from app.services.feature_service import get_effective_features
29
-
30
- router = APIRouter(prefix="/analysis", tags=["analysis"])
31
- oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token")
32
-
33
@router.post("/token")
async def login(form_data: Annotated[OAuth2PasswordRequestForm, Depends()]):
    """Issue a bearer token for the single hard-coded demo account.

    Kept for the legacy demo only; the original comment states that real
    authentication lives at ``/auth/login``.

    Raises:
        HTTPException: 400 when the submitted credentials do not match.
    """
    credentials_ok = form_data.username == "analyst" and form_data.password == "visique"
    if not credentials_ok:
        raise HTTPException(status_code=400, detail="Incorrect username or password")

    token = create_access_token(data={"sub": form_data.username})
    return {"access_token": token, "token_type": "bearer"}
39
-
40
- # Admin Dependency
41
def get_current_admin(current_user: User = Depends(get_current_user)):
    """Dependency that admits only administrators.

    Args:
        current_user: The authenticated user resolved by ``get_current_user``.

    Returns:
        The same user object when ``is_admin`` is truthy.

    Raises:
        HTTPException: 403 when the user is not an admin.
    """
    if not current_user.is_admin:
        raise HTTPException(status_code=403, detail="Admin privileges required")
    return current_user
48
-
49
@router.get("/admin/users")
def get_all_users(
    admin: User = Depends(get_current_admin),
    db: Session = Depends(get_db)
):
    """Return a summary dict for every user (admin only)."""

    def _summarize(user):
        # ``preferred_engine`` falls back to "v1" when the attribute is absent.
        return {
            "id": user.id,
            "email": user.email,
            "full_name": user.full_name,
            "company_name": user.company_name,
            "is_admin": user.is_admin,
            "created_at": user.created_at,
            "analysis_count": len(user.analyses),
            "preferred_engine": getattr(user, "preferred_engine", "v1")
        }

    every_user = db.query(User).all()
    return [_summarize(user) for user in every_user]
68
-
69
@router.get("/admin/analyses")
def get_all_analyses(
    admin: User = Depends(get_current_admin),
    db: Session = Depends(get_db)
):
    """Return every stored analysis, newest first, with owner details (admin only)."""
    newest_first = db.query(Analysis).order_by(Analysis.timestamp.desc())

    rows = []
    for record in newest_first.all():
        rows.append({
            "id": record.id,
            "user_email": record.owner.email,
            "user_company": record.owner.company_name,
            "company_name": record.company_name,
            "filename": record.input_filename,
            "timestamp": record.timestamp,
        })
    return rows
86
-
87
@router.get("/admin/analyses/{analysis_id}/download")
def admin_download_file(
    analysis_id: int,
    admin: User = Depends(get_current_admin),
    db: Session = Depends(get_db)
):
    """Send the originally uploaded file of any analysis to an admin.

    Raises:
        HTTPException: 404 when the analysis or its stored path is missing,
            or when the file no longer exists on disk.
    """
    record = db.query(Analysis).filter(Analysis.id == analysis_id).first()
    stored_path = record.stored_filename if record else None
    if not stored_path:
        raise HTTPException(status_code=404, detail="File not found")

    if not os.path.exists(stored_path):
        raise HTTPException(status_code=404, detail="File missing from server storage")

    download_name = f"ADMIN_EXPORT_{record.input_filename}"
    return FileResponse(
        path=stored_path,
        filename=download_name,
        media_type='application/octet-stream'
    )
105
-
106
-
107
- import json
108
-
109
- # Admin Dependency
110
def get_current_admin(current_user: User = Depends(get_current_user)):
    """Return the authenticated user if they are an admin, else raise 403."""
    if current_user.is_admin:
        return current_user
    raise HTTPException(status_code=403, detail="Admin privileges required")
114
-
115
@router.post("/upload/csv", response_model=StandardizedDataPackage)
async def analyze_csv(
    file: UploadFile = File(...),
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Upload a CSV file, analyze it and persist the result.

    Steps: enforce the monthly upload limit, stream the upload to
    ``uploads/``, parse it with ``CSVParser``, run the analyzer chosen by
    ``AnalysisFactory``, store an ``Analysis`` row and increment the user's
    upload count.

    Raises:
        HTTPException: 403 when the upload limit is reached, 400 when the
            file name is missing or does not end in ``.csv``, 500 when
            parsing, analysis or persistence fails.

    Returns:
        The ``StandardizedDataPackage`` with ``analysis_id`` filled in.
    """
    import uuid
    from app.services.feature_service import check_upload_limit, increment_upload_count

    # Check upload limit
    limit_check = check_upload_limit(db, current_user)
    if not limit_check["can_upload"]:
        raise HTTPException(
            status_code=403,
            detail=f"Monthly upload limit reached ({limit_check['uploads_limit']} uploads). Upgrade your plan for more uploads. Resets on {limit_check['reset_date'][:10]}."
        )

    # The client controls ``file.filename`` (it may even be absent); keep only
    # the final path component before it becomes part of a disk path.
    original_name = os.path.basename((file.filename or "").replace("\\", "/"))
    if not original_name.endswith('.csv'):
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a .csv file.")

    # A UUID prefix keeps stored names unique across uploads.
    safe_filename = f"{uuid.uuid4()}_{original_name}"
    upload_dir = "uploads"
    os.makedirs(upload_dir, exist_ok=True)

    file_path = os.path.join(upload_dir, safe_filename)

    try:
        with open(file_path, "wb") as file_object:
            while content := await file.read(1024 * 1024):  # Stream 1MB chunks
                file_object.write(content)

        report = CSVParser.parse(file_path)

        # Select engine based on user preference.
        analyzer = AnalysisFactory.get_analyzer(current_user)
        # Fetch enabled features for the user's plan.
        # NOTE(review): the PDF/XLSX endpoints call resolve_user_features(db,
        # current_user) instead - confirm whether this one should match.
        enabled_features = get_effective_features(db, current_user.plan or "Free")
        analysis_result = analyzer.analyze(report, user_address=current_user.address, enabled_features=enabled_features)

        # Include risk_factors (which contain "Pain Point:" entries) in the insights array
        risk_factors = analysis_result["risk_analysis"].risk_factors if analysis_result.get("risk_analysis") else []
        all_insights = analysis_result["insights"] + analysis_result["recommendations"] + risk_factors

        result_package = StandardizedDataPackage(
            raw_data=report,
            kpis=analysis_result["kpis"],
            risk_analysis=analysis_result["risk_analysis"],
            health_score=analysis_result["health_score"],
            insights=all_insights,
            runway_forecast=analysis_result["runway_forecast"],
            optimization_insights=analysis_result["optimization_insights"],
            geo_analysis=analysis_result.get("geo_analysis")
        )

        # Save to DB
        db_analysis = Analysis(
            user_id=current_user.id,
            company_name=report.company_name,
            input_filename=file.filename,
            stored_filename=file_path,
            result_json=result_package.json()
        )
        db.add(db_analysis)
        db.commit()
        db.refresh(db_analysis)

        result_package.analysis_id = db_analysis.id

        # Increment upload count AFTER successful save
        increment_upload_count(db, current_user)

        return result_package

    except Exception as e:
        # Cleanup if analysis fails, but keep if successful
        if os.path.exists(file_path):
            os.remove(file_path)
        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}") from e
201
-
202
@router.post("/save")
async def save_analysis_result(
    payload: dict,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """
    Receives pre-computed analysis results from Vercel serverless functions
    and persists them to the database. This endpoint does NOT run analysis -
    it only handles authentication and database storage.

    The optional ``original_filename`` key is removed from ``payload`` and
    stored as the input file name; the remaining payload is saved as JSON.

    Raises:
        HTTPException: 500 when storing the analysis fails.

    Returns:
        ``{"status": "saved", "analysis_id": <id>}``.
    """
    # The module only imports get_effective_features from feature_service, so
    # this name must be imported here (as the upload endpoints do); without it
    # the call below raised NameError after the row was already committed.
    from app.services.feature_service import increment_upload_count

    try:
        company_name = "Unknown"
        raw_data = payload.get("raw_data", {})
        if isinstance(raw_data, dict):
            company_name = raw_data.get("company_name", "Unknown")

        original_filename = payload.pop("original_filename", "uploaded_file")

        db_analysis = Analysis(
            user_id=current_user.id,
            company_name=company_name,
            input_filename=original_filename,
            # Marker value: no file is kept on this server for these results.
            stored_filename="vercel_processed",
            result_json=json.dumps(payload)
        )
        db.add(db_analysis)
        db.commit()
        db.refresh(db_analysis)

        # Increment upload count
        increment_upload_count(db, current_user)

        return {"status": "saved", "analysis_id": db_analysis.id}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to save analysis: {str(e)}") from e
238
-
239
@router.get("/history")
def get_history(
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """List the current user's analyses, newest first.

    Only lightweight fields are returned; the full result JSON is left out.
    """
    own_analyses = (
        db.query(Analysis)
        .filter(Analysis.user_id == current_user.id)
        .order_by(Analysis.timestamp.desc())
        .all()
    )

    summaries = []
    for entry in own_analyses:
        summaries.append({
            "id": entry.id,
            "company_name": entry.company_name,
            "filename": entry.input_filename,
            "timestamp": entry.timestamp,
        })
    return summaries
255
-
256
@router.get("/history/{analysis_id}", response_model=StandardizedDataPackage)
def get_analysis_detail(
    analysis_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Return the full stored result of one of the current user's analyses.

    Raises:
        HTTPException: 404 when no analysis with this id belongs to the user.
    """
    owned = db.query(Analysis).filter(
        Analysis.id == analysis_id,
        Analysis.user_id == current_user.id,
    )
    record = owned.first()
    if record is None:
        raise HTTPException(status_code=404, detail="Analysis not found")

    # Rebuild the package from the stored JSON and attach the row id.
    package = StandardizedDataPackage.parse_raw(record.result_json)
    package.analysis_id = record.id
    return package
269
-
270
- from fastapi.responses import FileResponse
271
@router.get("/history/{analysis_id}/download")
def download_original_file(
    analysis_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Send back the file the current user originally uploaded for an analysis.

    Raises:
        HTTPException: 404 when the analysis is not the user's, has no stored
            path, or the file is gone from disk.
    """
    record = db.query(Analysis).filter(
        Analysis.id == analysis_id,
        Analysis.user_id == current_user.id,
    ).first()

    stored_path = record.stored_filename if record else None
    if not stored_path:
        raise HTTPException(status_code=404, detail="File not found")

    if not os.path.exists(stored_path):
        raise HTTPException(status_code=404, detail="File missing from server storage")

    return FileResponse(
        path=stored_path,
        filename=record.input_filename,
        media_type='application/octet-stream'
    )
289
-
290
-
291
@router.delete("/history/{analysis_id}")
def delete_analysis(
    analysis_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Delete one of the current user's analyses and its stored file.

    Raises:
        HTTPException: 404 when no analysis with this id belongs to the user.
    """
    record = db.query(Analysis).filter(
        Analysis.id == analysis_id,
        Analysis.user_id == current_user.id,
    ).first()
    if record is None:
        raise HTTPException(status_code=404, detail="Analysis not found")

    # Removing the file is best effort: the row is deleted even if it fails.
    stored_path = record.stored_filename
    if stored_path and os.path.exists(stored_path):
        try:
            os.remove(stored_path)
        except OSError:
            pass

    db.delete(record)
    db.commit()
    return {"status": "success", "message": "Analysis deleted"}
311
-
312
class UpdateAnalysisRequest(BaseModel):
    """Request body for renaming the company of a stored analysis."""

    # New company name to store on the analysis.
    company_name: str
314
-
315
@router.patch("/history/{analysis_id}")
def update_analysis(
    analysis_id: int,
    request: UpdateAnalysisRequest,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Rename the company of one of the current user's analyses.

    The name is updated on the row and, when possible, inside the stored
    result JSON (``raw_data.company_name``) so both stay consistent.

    Raises:
        HTTPException: 404 when no analysis with this id belongs to the user.
    """
    analysis = db.query(Analysis).filter(Analysis.id == analysis_id, Analysis.user_id == current_user.id).first()
    if not analysis:
        raise HTTPException(status_code=404, detail="Analysis not found")

    analysis.company_name = request.company_name

    # Update the stored JSON to reflect new name (consistency)
    try:
        data = json.loads(analysis.result_json)
        data['raw_data']['company_name'] = request.company_name
        analysis.result_json = json.dumps(data)
    except (ValueError, KeyError, TypeError):
        # Malformed, missing or unexpectedly shaped JSON: just update the DB
        # record. Naming the exceptions keeps KeyboardInterrupt/SystemExit and
        # genuine programming errors from being swallowed.
        pass

    db.commit()
    return {"status": "success", "message": "Analysis updated", "company_name": analysis.company_name}
338
-
339
@router.post("/upload/pdf", response_model=StandardizedDataPackage)
async def analyze_pdf(
    file: UploadFile = File(...),
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Upload a PDF file, analyze it and persist the result.

    Steps: enforce the monthly upload limit, stream the upload to
    ``uploads/``, parse it with ``PDFParser``, run the analyzer chosen by
    ``AnalysisFactory`` with the user's resolved feature flags, store an
    ``Analysis`` row and increment the user's upload count.

    Raises:
        HTTPException: 403 when the upload limit is reached, 400 when the
            file name is missing or does not end in ``.pdf``, 500 when
            parsing, analysis or persistence fails.

    Returns:
        The ``StandardizedDataPackage`` with ``analysis_id`` filled in.
    """
    import uuid
    from app.services.feature_service import (
        check_upload_limit,
        increment_upload_count,
        resolve_user_features,
    )

    # Check upload limit
    limit_check = check_upload_limit(db, current_user)
    if not limit_check["can_upload"]:
        raise HTTPException(
            status_code=403,
            detail=f"Monthly upload limit reached ({limit_check['uploads_limit']} uploads). Upgrade your plan for more uploads. Resets on {limit_check['reset_date'][:10]}."
        )

    # The client controls ``file.filename`` (it may even be absent); keep only
    # the final path component before it becomes part of a disk path.
    original_name = os.path.basename((file.filename or "").replace("\\", "/"))
    if not original_name.endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a .pdf file.")

    safe_filename = f"{uuid.uuid4()}_{original_name}"
    upload_dir = "uploads"
    os.makedirs(upload_dir, exist_ok=True)

    file_path = os.path.join(upload_dir, safe_filename)

    try:
        # Stream in 1MB chunks (as the CSV endpoint does) instead of holding
        # the whole upload in memory.
        with open(file_path, "wb") as file_object:
            while content := await file.read(1024 * 1024):
                file_object.write(content)

        report = PDFParser.parse(file_path)

        # Select engine based on user preference.
        analyzer = AnalysisFactory.get_analyzer(current_user)

        # Resolve all feature flags (Plan + Custom + Engine limits)
        enabled_features = resolve_user_features(db, current_user)

        analysis_result = analyzer.analyze(report, user_address=current_user.address, enabled_features=enabled_features)

        # Include risk_factors (which contain "Pain Point:" entries) in the insights array
        risk_factors = analysis_result["risk_analysis"].risk_factors if analysis_result.get("risk_analysis") else []
        all_insights = analysis_result["insights"] + analysis_result["recommendations"] + risk_factors

        result_package = StandardizedDataPackage(
            raw_data=report,
            kpis=analysis_result["kpis"],
            risk_analysis=analysis_result["risk_analysis"],
            health_score=analysis_result["health_score"],
            insights=all_insights,
            runway_forecast=analysis_result["runway_forecast"],
            optimization_insights=analysis_result["optimization_insights"],
            geo_analysis=analysis_result.get("geo_analysis")
        )

        # Save to DB
        db_analysis = Analysis(
            user_id=current_user.id,
            company_name=report.company_name,
            input_filename=file.filename,
            stored_filename=file_path,
            result_json=result_package.json()
        )
        db.add(db_analysis)
        db.commit()
        db.refresh(db_analysis)

        result_package.analysis_id = db_analysis.id

        # Increment upload count AFTER successful save
        increment_upload_count(db, current_user)

        return result_package

    except Exception as e:
        # Remove the stored upload when anything above fails.
        if os.path.exists(file_path):
            os.remove(file_path)
        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}") from e
419
-
420
-
421
- # =============================================================================
422
- # XLSX UPLOAD ENDPOINT
423
- # =============================================================================
424
-
425
@router.post("/upload/xlsx", response_model=StandardizedDataPackage)
async def analyze_xlsx(
    file: UploadFile = File(...),
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Upload and analyze an Excel (.xlsx, .xls) file.

    Steps: enforce the monthly upload limit, stream the upload to
    ``uploads/``, parse it with ``XLSXParser``, run the analyzer chosen by
    ``AnalysisFactory`` with the user's resolved feature flags, store an
    ``Analysis`` row and increment the user's upload count.

    Raises:
        HTTPException: 403 when the upload limit is reached, 400 when the
            file name is missing or has another extension, 500 when parsing,
            analysis or persistence fails.

    Returns:
        The ``StandardizedDataPackage`` with ``analysis_id`` filled in.
    """
    import uuid
    from app.services.feature_service import (
        check_upload_limit,
        increment_upload_count,
        resolve_user_features,
    )

    # Check upload limit
    limit_check = check_upload_limit(db, current_user)
    if not limit_check["can_upload"]:
        raise HTTPException(
            status_code=403,
            detail=f"Monthly upload limit reached ({limit_check['uploads_limit']} uploads). Upgrade your plan for more uploads. Resets on {limit_check['reset_date'][:10]}."
        )

    # The client controls ``file.filename`` (it may even be absent); keep only
    # the final path component before it becomes part of a disk path.
    original_name = os.path.basename((file.filename or "").replace("\\", "/"))
    if not original_name.endswith(('.xlsx', '.xls')):
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload an .xlsx or .xls file.")

    safe_filename = f"{uuid.uuid4()}_{original_name}"
    upload_dir = "uploads"
    os.makedirs(upload_dir, exist_ok=True)

    file_path = os.path.join(upload_dir, safe_filename)

    try:
        # Stream in 1MB chunks (as the CSV endpoint does) instead of holding
        # the whole upload in memory.
        with open(file_path, "wb") as file_object:
            while content := await file.read(1024 * 1024):
                file_object.write(content)

        # Use XLSX Parser
        from app.services.ingestion.parser_xlsx import XLSXParser
        report = XLSXParser.parse(file_path)

        # Select engine based on user preference.
        analyzer = AnalysisFactory.get_analyzer(current_user)

        # Resolve all feature flags (Plan + Custom + Engine limits)
        enabled_features = resolve_user_features(db, current_user)

        analysis_result = analyzer.analyze(report, user_address=current_user.address, enabled_features=enabled_features)

        risk_factors = analysis_result["risk_analysis"].risk_factors if analysis_result.get("risk_analysis") else []
        all_insights = analysis_result["insights"] + analysis_result["recommendations"] + risk_factors

        result_package = StandardizedDataPackage(
            raw_data=report,
            kpis=analysis_result["kpis"],
            risk_analysis=analysis_result["risk_analysis"],
            health_score=analysis_result["health_score"],
            insights=all_insights,
            runway_forecast=analysis_result["runway_forecast"],
            optimization_insights=analysis_result["optimization_insights"],
            geo_analysis=analysis_result.get("geo_analysis")
        )

        # Save to DB
        db_analysis = Analysis(
            user_id=current_user.id,
            company_name=report.company_name,
            input_filename=file.filename,
            stored_filename=file_path,
            result_json=result_package.json()
        )
        db.add(db_analysis)
        db.commit()
        db.refresh(db_analysis)

        result_package.analysis_id = db_analysis.id

        # Increment upload count
        increment_upload_count(db, current_user)

        return result_package

    except Exception as e:
        # Remove the stored upload when anything above fails.
        if os.path.exists(file_path):
            os.remove(file_path)
        raise HTTPException(status_code=500, detail=f"XLSX Analysis failed: {str(e)}") from e
507
-
508
-
509
- # =============================================================================
510
- # BULK DELETE ENDPOINTS
511
- # =============================================================================
512
-
513
class BulkDeleteRequest(BaseModel):
    """Request body listing the analyses to delete in one call."""

    # Ids of the analyses to delete.
    ids: list[int]
515
-
516
# NOTE(review): "/history/{analysis_id}" is registered for DELETE earlier in
# this file; it may match "/history/bulk-delete" first - confirm route order.
@router.delete("/history/bulk-delete")
def bulk_delete_analyses(
    request: BulkDeleteRequest,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Delete multiple analyses at once.

    Ids that do not belong to the current user are reported in ``errors``
    rather than aborting the whole request. All deletions are committed
    together at the end.
    """
    removed = 0
    problems = []

    for wanted_id in request.ids:
        record = db.query(Analysis).filter(
            Analysis.id == wanted_id,
            Analysis.user_id == current_user.id
        ).first()

        if record:
            # Removing the file is best effort; the row goes regardless.
            stored_path = record.stored_filename
            if stored_path and os.path.exists(stored_path):
                try:
                    os.remove(stored_path)
                except OSError:
                    pass

            db.delete(record)
            removed += 1
        else:
            problems.append(f"Analysis {wanted_id} not found")

    db.commit()

    return {
        "status": "success",
        "deleted_count": removed,
        "errors": problems if problems else None
    }
553
-
554
-
555
class DateRangeDeleteRequest(BaseModel):
    """Request body giving the inclusive date range of analyses to delete."""

    # First day of the range, formatted YYYY-MM-DD.
    start_date: str
    # Last day of the range, formatted YYYY-MM-DD.
    end_date: str
558
-
559
# NOTE(review): "/history/{analysis_id}" is registered for DELETE earlier in
# this file; it may match "/history/delete-range" first - confirm route order.
@router.delete("/history/delete-range")
def delete_analyses_in_range(
    request: DateRangeDeleteRequest,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Delete all analyses within a date range.

    Both ``start_date`` and ``end_date`` are inclusive calendar days in
    ``YYYY-MM-DD`` format. Only the current user's analyses are touched.

    Raises:
        HTTPException: 400 when either date cannot be parsed.

    Returns:
        A dict with ``status``, ``deleted_count`` and the echoed ``date_range``.
    """
    from datetime import datetime, timedelta

    try:
        start = datetime.strptime(request.start_date, "%Y-%m-%d")
        # Exclusive upper bound at the start of the following day. The former
        # inclusive bound of 23:59:59 missed timestamps in the last second of
        # the end date that carried microseconds.
        end_exclusive = datetime.strptime(request.end_date, "%Y-%m-%d") + timedelta(days=1)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid date format. Use YYYY-MM-DD.") from None

    # Find analyses in range
    analyses = db.query(Analysis).filter(
        Analysis.user_id == current_user.id,
        Analysis.timestamp >= start,
        Analysis.timestamp < end_exclusive
    ).all()

    deleted_count = 0
    for analysis in analyses:
        # Removing the file is best effort; the row is deleted regardless.
        if analysis.stored_filename and os.path.exists(analysis.stored_filename):
            try:
                os.remove(analysis.stored_filename)
            except OSError:
                pass
        db.delete(analysis)
        deleted_count += 1

    db.commit()

    return {
        "status": "success",
        "deleted_count": deleted_count,
        "date_range": f"{request.start_date} to {request.end_date}"
    }
598
-
599
class SimulationRequest(BaseModel):
    """Request body for a what-if simulation on an existing data package.

    Each ``delta_*`` field defaults to 0.0 and is forwarded by the
    ``/simulate`` endpoint as the matching ``*_percent`` argument.
    """

    # Package whose ``raw_data`` is used as the simulation baseline.
    data: StandardizedDataPackage
    delta_revenue: float = 0.0
    delta_cogs: float = 0.0
    delta_payroll: float = 0.0
    delta_marketing: float = 0.0
    delta_fixed_costs: float = 0.0
606
-
607
@router.post("/simulate", response_model=StandardizedDataPackage)
async def run_simulation(request: SimulationRequest, user: str = Depends(get_current_user)):
    """Run a what-if simulation on the submitted data and return the result.

    The request's ``delta_*`` values are passed to
    ``SimulationService.run_simulation`` as the matching ``*_percent`` arguments.
    """
    adjustments = {
        "delta_revenue_percent": request.delta_revenue,
        "delta_cogs_percent": request.delta_cogs,
        "delta_payroll_percent": request.delta_payroll,
        "delta_marketing_percent": request.delta_marketing,
        "delta_fixed_costs_percent": request.delta_fixed_costs,
    }
    baseline = request.data.raw_data
    return SimulationService.run_simulation(original_data=baseline, **adjustments)
617
-
618
@router.get("/history/{analysis_id}/export/pdf")
def export_analysis_pdf(
    analysis_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Generate a PDF report for one of the current user's analyses.

    Raises:
        HTTPException: 404 when the analysis is not the user's, 500 when the
            stored result JSON cannot be parsed.
    """
    from datetime import datetime
    from fastapi.responses import FileResponse

    record = db.query(Analysis).filter(
        Analysis.id == analysis_id,
        Analysis.user_id == current_user.id,
    ).first()
    if record is None:
        raise HTTPException(status_code=404, detail="Analysis not found")

    # Parse the stored JSON back into a data package.
    try:
        package = StandardizedDataPackage.parse_raw(record.result_json)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Data corruption: {str(e)}")

    # Keep only alphanumerics, space, underscore and hyphen in the temp path.
    company = package.raw_data.company_name
    path_safe_chars = [ch for ch in company if ch.isalnum() or ch in " _-"]
    report_path = f"/tmp/{''.join(path_safe_chars)}_{record.id}_report.pdf"

    PDFReporter.generate(package, report_path)

    today = datetime.now().strftime("%Y-%m-%d")
    download_name = f"Visi-Insight Report - {company} - {today}.pdf"
    return FileResponse(report_path, media_type='application/pdf', filename=download_name)
646
-
647
@router.post("/ai-cfo", response_model=str)
async def get_ai_summary(data: StandardizedDataPackage, user: str = Depends(get_current_user)):
    """Return the executive summary that ``AICFOService`` produces for ``data``."""
    # Imported here, as in the original, rather than at module load.
    from app.services.intelligence.ai_cfo import AICFOService

    summary = AICFOService.generate_executive_summary(data)
    return summary
651
-
652
@router.post("/chat", response_model=ChatResponse)
async def chat_with_data(request: ChatRequest, user: str = Depends(get_current_user)):
    """Answer a chat request against a hard-coded demo data package.

    This is a stateless scaffold: there is no session store yet, so the
    question is answered against a dummy "Demo Corp" context, not the user's
    own uploads. The intended design, per the original notes, is:

    1. User uploads a file -> backend stores a vector index id in the session.
    2. /chat -> retrieves the index id -> queries the vector DB.
    """
    from datetime import date
    from app.schemas.financial import StandardizedDataPackage, FinancialReport, IncomeStatementStandard, BalanceSheetStandard, CashFlowStandard, KPIMetrics, RiskAnalysis, HealthScoreBreakdown
    from app.services.intelligence.gemini_service import GeminiService

    # Dummy context that proves the endpoint works end to end.
    demo_report = FinancialReport(
        company_name="Demo Corp",
        period_end=date.today(),
        income_statement=IncomeStatementStandard(revenue=1200000, net_income=240000, cogs=600000),
        balance_sheet=BalanceSheetStandard(),
        cash_flow=CashFlowStandard()
    )
    demo_risk = RiskAnalysis(risk_score=85, risk_factors=[], liquidity_risk="Low", solvency_risk="Low")
    demo_health = HealthScoreBreakdown(stability=20, profitability=20, growth=20, efficiency=20, total_score=80)

    demo_context = StandardizedDataPackage(
        raw_data=demo_report,
        kpis=KPIMetrics(net_margin=20.0),
        risk_analysis=demo_risk,
        health_score=demo_health,
        insights=["Automated Report Generation Successful"],
        optimization_insights=None  # Should be populated normally
    )

    return GeminiService.query(request, demo_context)
690
-
691
@router.get("/export/pptx/{company_name}")
async def export_pptx(company_name: str):
    """Generate a demo PowerPoint presentation for ``company_name``.

    The presentation is built from hard-coded dummy figures; only the company
    name comes from the request.

    NOTE(review): unlike the other export endpoints in this file, this one
    declares no authentication dependency - confirm that is intended.

    Returns:
        A ``FileResponse`` streaming the generated ``.pptx`` file.
    """
    from fastapi.responses import FileResponse

    dummy_data = StandardizedDataPackage(
        raw_data=FinancialReport(
            company_name=company_name,
            period_end=date.today(),
            income_statement=IncomeStatementStandard(revenue=1000000, net_income=200000, cogs=500000),
            balance_sheet=BalanceSheetStandard(),
            cash_flow=CashFlowStandard()
        ),
        kpis=KPIMetrics(net_margin=20.0),
        risk_analysis=RiskAnalysis(risk_score=85, risk_factors=[], liquidity_risk="Low", solvency_risk="Low"),
        health_score=HealthScoreBreakdown(stability=20, profitability=20, growth=20, efficiency=20, total_score=80),
        insights=["Automated Report Generation Successful"]
    )

    # ``company_name`` comes straight from the URL; keep only characters that
    # are safe in a file name before building the temp path (same filter as
    # the PDF export endpoint).
    safe_name = "".join(x for x in company_name if x.isalnum() or x in " _-")
    filename = f"/tmp/{safe_name}_presentation.pptx"
    PPTXReporter.generate(dummy_data, filename)

    return FileResponse(filename, media_type='application/vnd.openxmlformats-officedocument.presentationml.presentation', filename=f"{company_name}_presentation.pptx")
713
-
714
class EngineUpdate(BaseModel):
    """Request body for changing a user's preferred analysis engine."""

    # Engine identifier; the update endpoint accepts only "v1" or "v2".
    engine: str
716
-
717
@router.put("/admin/users/{user_id}/engine")
def update_user_engine(
    user_id: int,
    update: EngineUpdate,
    admin: User = Depends(get_current_admin),
    db: Session = Depends(get_db)
):
    """Set the preferred analysis engine of a user (admin only).

    Raises:
        HTTPException: 404 when the user does not exist, 400 when the engine
            is neither "v1" nor "v2".
    """
    target = db.query(User).filter(User.id == user_id).first()
    if target is None:
        raise HTTPException(status_code=404, detail="User not found")

    allowed_engines = ["v1", "v2"]
    if update.engine not in allowed_engines:
        raise HTTPException(status_code=400, detail="Invalid engine. Use 'v1' or 'v2'.")

    target.preferred_engine = update.engine
    db.commit()
    return {"status": "success", "engine": target.preferred_engine}
734
-
735
@router.get("/public-config")
def get_public_config(db: Session = Depends(get_db)):
    """Get configuration for Guest/Public users.

    Returns the effective features of the "Guest" plan and a fixed upload
    limit of 2.
    """
    from app.services.feature_service import get_effective_features

    guest_features = get_effective_features(db, "Guest")
    config = {
        "guest_features": guest_features,
        "upload_limit": 2
    }
    return config
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/core/__init__.py DELETED
@@ -1,52 +0,0 @@
1
- """
2
- Core Configuration Package
3
-
4
- This package contains application-wide configuration and utilities.
5
-
6
- ## Modules
7
-
8
- - `config.py` - Environment variables and settings
9
- - `database.py` - SQLAlchemy engine and session
10
- - `security.py` - JWT token creation/validation
11
- - `feature_registry.py` - Centralized feature definitions (auto-discoverable)
12
- - `plan_config.py` - Plan limits and default feature sets
13
-
14
- ## Feature System Architecture
15
-
16
- The feature system uses a layered approach:
17
-
18
- 1. **Feature Registry** (`feature_registry.py`)
19
- - Defines ALL controllable features
20
- - Features auto-appear in admin console
21
- - Organized by category for easy navigation
22
-
23
- 2. **Plan Config** (`plan_config.py`)
24
- - Default features per plan tier
25
- - Upload limits per plan
26
- - Wildcard "*" for unlimited access
27
-
28
- 3. **Admin Overrides** (via `models/feature_flags.py`)
29
- - Stored in database
30
- - Takes precedence over defaults
31
- - Managed via admin API
32
-
33
- ## Adding New Features
34
-
35
- ```python
36
- # In feature_registry.py, add to FEATURE_REGISTRY:
37
- Feature(
38
- id="new_feature_id",
39
- name="New Feature Name",
40
- description="What this feature does",
41
- category=FeatureCategory.CORE_METRICS # Pick appropriate category
42
- )
43
- ```
44
-
45
- The feature will automatically:
46
- - Appear in admin console UI
47
- - Be toggleable per plan
48
- - Respect plan defaults until overridden
49
- """
50
-
51
- from app.core.feature_registry import FEATURE_REGISTRY, Feature, FeatureCategory
52
- from app.core.plan_config import PLAN_DEFAULTS, get_plan_config, get_default_features
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/core/config.py DELETED
@@ -1,59 +0,0 @@
1
- from pydantic_settings import BaseSettings, SettingsConfigDict
2
- from pydantic import field_validator
3
- from typing import List, Union, Optional
4
-
5
- class Settings(BaseSettings):
6
- # Application Config
7
- PROJECT_NAME: str = "Visique API"
8
- VERSION: str = "0.1.0"
9
- API_V1_STR: str = "/api/v1"
10
-
11
- # Security
12
- SECRET_KEY: str # Required in production
13
- ALGORITHM: str = "HS256"
14
- ACCESS_TOKEN_EXPIRE_MINUTES: int = 1440 # 24 hours for better UX
15
-
16
- # Database
17
- DATABASE_URL: str # PostgreSQL URL
18
-
19
- # CORS
20
- ALLOWED_ORIGINS: Union[List[str], str] = [
21
- "http://localhost:3000",
22
- "http://127.0.0.1:3000",
23
- "https://visique-testing.vercel.app",
24
- "https://visique-frontend.vercel.app"
25
- ]
26
-
27
- @field_validator("ALLOWED_ORIGINS", mode="before")
28
- @classmethod
29
- def assemble_cors_origins(cls, v: Union[str, List[str]]) -> Union[List[str], str]:
30
- if isinstance(v, str) and not v.startswith("["):
31
- return [i.strip() for i in v.split(",")]
32
- elif isinstance(v, str) and v.startswith("["):
33
- import json
34
- return json.loads(v)
35
- elif isinstance(v, list):
36
- return v
37
- raise ValueError(v)
38
-
39
- # Stripe
40
- STRIPE_SECRET_KEY: Optional[str] = None
41
- STRIPE_PUBLISHABLE_KEY: Optional[str] = None
42
- STRIPE_WEBHOOK_SECRET: Optional[str] = None
43
-
44
- # Deployment
45
- ENVIRONMENT: str = "development"
46
-
47
- # Dolphin PDF Extraction
48
- DOLPHIN_MODEL_PATH: Optional[str] = None # Auto-downloads if None
49
- DOLPHIN_DEVICE: str = "auto" # "auto" (CUDA > MPS > CPU) | "cuda" | "mps" | "cpu"
50
- DOLPHIN_MAX_BATCH_SIZE: int = 4
51
- DOLPHIN_AUTO_DOWNLOAD: bool = True
52
-
53
- # Dolphin Remote Service (Optional - for distributed setup)
54
- DOLPHIN_API_URL: Optional[str] = None
55
- DOLPHIN_API_KEY: Optional[str] = None
56
-
57
- model_config = SettingsConfigDict(env_file=".env", case_sensitive=True, extra="ignore")
58
-
59
- settings = Settings()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/core/database.py DELETED
@@ -1,26 +0,0 @@
1
- from sqlalchemy import create_engine
2
- from sqlalchemy.ext.declarative import declarative_base
3
- from sqlalchemy.orm import sessionmaker
4
-
5
- from app.core.config import settings
6
-
7
- SQLALCHEMY_DATABASE_URL = settings.DATABASE_URL
8
-
9
- # Fix for Render/SQLAlchemy postgres:// scheme
10
- if SQLALCHEMY_DATABASE_URL.startswith("postgres://"):
11
- SQLALCHEMY_DATABASE_URL = SQLALCHEMY_DATABASE_URL.replace("postgres://", "postgresql://", 1)
12
-
13
- engine = create_engine(
14
- SQLALCHEMY_DATABASE_URL,
15
- pool_pre_ping=True
16
- )
17
- SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
18
-
19
- Base = declarative_base()
20
-
21
- def get_db():
22
- db = SessionLocal()
23
- try:
24
- yield db
25
- finally:
26
- db.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/core/feature_registry.py DELETED
@@ -1,255 +0,0 @@
1
- """
2
- Feature Registry - Auto-Discoverable Feature System
3
-
4
- Add new features here and they will automatically appear in the admin console.
5
- Each feature belongs to a category and can be toggled per plan.
6
- """
7
-
8
- from enum import Enum
9
- from dataclasses import dataclass, field
10
- from typing import List, Dict, Optional
11
-
12
-
13
- class FeatureCategory(Enum):
14
- """Categories for organizing features in admin console"""
15
- CORE_METRICS = "Core Metrics"
16
- RISK_ANALYSIS = "Risk Analysis"
17
- FORECASTING = "Forecasting"
18
- AI_INTELLIGENCE = "AI Intelligence"
19
- INTERACTIVE = "Interactive Tools"
20
- EXPORTS = "Exports & Reports"
21
-
22
-
23
- @dataclass
24
- class Feature:
25
- """
26
- Represents a controllable feature in the system.
27
-
28
- Attributes:
29
- id: Unique identifier used in code and API
30
- name: Human-readable name for admin console
31
- category: Grouping category
32
- description: Brief description of the feature
33
- default_enabled: Whether enabled by default for new plans
34
- """
35
- id: str
36
- name: str
37
- category: FeatureCategory
38
- description: str
39
- default_enabled: bool = True
40
- memory_cost_mb: int = 5 # Estimated RAM usage in MB
41
-
42
-
43
-
44
- # =============================================================================
45
- # FEATURE REGISTRY - ADD NEW FEATURES HERE
46
- # =============================================================================
47
- # When adding new financial model outputs, add a Feature entry below.
48
- # It will automatically appear in the admin console under the correct category.
49
- # =============================================================================
50
-
51
- FEATURE_REGISTRY: List[Feature] = [
52
- # -------------------------------------------------------------------------
53
- # Core Metrics
54
- # -------------------------------------------------------------------------
55
- Feature(
56
- id="kpi_margins",
57
- name="Profit Margins (Gross/Operating/Net)",
58
- category=FeatureCategory.CORE_METRICS,
59
- description="Core margin KPIs from income statement",
60
- memory_cost_mb=2
61
- ),
62
- Feature(
63
- id="kpi_ratios",
64
- name="Financial Ratios",
65
- category=FeatureCategory.CORE_METRICS,
66
- description="Current ratio, debt-to-equity, quick ratio",
67
- memory_cost_mb=2
68
- ),
69
- Feature(
70
- id="health_score",
71
- name="Health Score Dashboard",
72
- category=FeatureCategory.CORE_METRICS,
73
- description="Overall financial health scoring (stability, profitability, growth, efficiency)"
74
- ),
75
-
76
- # -------------------------------------------------------------------------
77
- # Risk Analysis
78
- # -------------------------------------------------------------------------
79
- Feature(
80
- id="risk_score",
81
- name="Risk Score",
82
- category=FeatureCategory.RISK_ANALYSIS,
83
- description="Aggregate risk scoring (0-100)",
84
- memory_cost_mb=5
85
- ),
86
- Feature(
87
- id="risk_factors",
88
- name="Risk Factor Breakdown",
89
- category=FeatureCategory.RISK_ANALYSIS,
90
- description="Detailed list of identified risk factors"
91
- ),
92
- Feature(
93
- id="liquidity_risk",
94
- name="Liquidity Risk",
95
- category=FeatureCategory.RISK_ANALYSIS,
96
- description="Cash flow and working capital risk assessment"
97
- ),
98
- Feature(
99
- id="solvency_risk",
100
- name="Solvency Risk",
101
- category=FeatureCategory.RISK_ANALYSIS,
102
- description="Long-term debt sustainability analysis"
103
- ),
104
-
105
- # -------------------------------------------------------------------------
106
- # Forecasting
107
- # -------------------------------------------------------------------------
108
- Feature(
109
- id="runway_forecast",
110
- name="Cash Runway Forecast",
111
- category=FeatureCategory.FORECASTING,
112
- description="30/60/90 day cash projections"
113
- ),
114
- Feature(
115
- id="burn_rate",
116
- name="Burn Rate Analysis",
117
- category=FeatureCategory.FORECASTING,
118
- description="Monthly cash burn rate calculation"
119
- ),
120
- Feature(
121
- id="optimization_insights",
122
- name="Optimization Insights",
123
- category=FeatureCategory.FORECASTING,
124
- description="Dead zones, peak premiums, cost optimization"
125
- ),
126
- Feature(
127
- id="budget_variance",
128
- name="Budget Variance Analysis",
129
- category=FeatureCategory.FORECASTING,
130
- description="Target vs actual comparison"
131
- ),
132
-
133
- # -------------------------------------------------------------------------
134
- # AI Intelligence
135
- # -------------------------------------------------------------------------
136
- Feature(
137
- id="ai_cfo",
138
- name="AI CFO Chat",
139
- category=FeatureCategory.AI_INTELLIGENCE,
140
- description="Conversational AI financial advisor",
141
- memory_cost_mb=80
142
- ),
143
- Feature(
144
- id="ai_summary",
145
- name="AI Executive Summary",
146
- category=FeatureCategory.AI_INTELLIGENCE,
147
- description="Auto-generated narrative insights",
148
- memory_cost_mb=60
149
- ),
150
- Feature(
151
- id="geo_insights",
152
- name="Geo-Strategic Insights",
153
- category=FeatureCategory.AI_INTELLIGENCE,
154
- description="Location-based market analysis",
155
- memory_cost_mb=150
156
- ),
157
- Feature(
158
- id="intelligence_card",
159
- name="Strategic Intelligence Card",
160
- category=FeatureCategory.AI_INTELLIGENCE,
161
- description="AI-powered strategic recommendations",
162
- memory_cost_mb=50
163
- ),
164
-
165
- # -------------------------------------------------------------------------
166
- # Interactive Tools
167
- # -------------------------------------------------------------------------
168
- Feature(
169
- id="what_if_slider",
170
- name="What-If Simulator",
171
- category=FeatureCategory.INTERACTIVE,
172
- description="Revenue/cost scenario modeling with sliders"
173
- ),
174
- Feature(
175
- id="interactive_charts",
176
- name="Interactive Charts",
177
- category=FeatureCategory.INTERACTIVE,
178
- description="Zoomable, hoverable data visualizations"
179
- ),
180
- Feature(
181
- id="trend_comparison",
182
- name="Trend Comparison",
183
- category=FeatureCategory.INTERACTIVE,
184
- description="Period-over-period analysis"
185
- ),
186
-
187
- # -------------------------------------------------------------------------
188
- # Exports & Reports
189
- # -------------------------------------------------------------------------
190
- Feature(
191
- id="pdf_export",
192
- name="PDF Report Export",
193
- category=FeatureCategory.EXPORTS,
194
- description="Downloadable PDF financial report"
195
- ),
196
- Feature(
197
- id="pptx_export",
198
- name="PowerPoint Export",
199
- category=FeatureCategory.EXPORTS,
200
- description="Presentation-ready slides"
201
- ),
202
- Feature(
203
- id="csv_export",
204
- name="Data Export (CSV)",
205
- category=FeatureCategory.EXPORTS,
206
- description="Raw data download for further analysis"
207
- ),
208
- ]
209
-
210
-
211
- # =============================================================================
212
- # HELPER FUNCTIONS
213
- # =============================================================================
214
-
215
- def get_all_features() -> List[Feature]:
216
- """Returns all registered features."""
217
- return FEATURE_REGISTRY
218
-
219
-
220
- def get_feature_by_id(feature_id: str) -> Optional[Feature]:
221
- """Get a specific feature by its ID."""
222
- for feature in FEATURE_REGISTRY:
223
- if feature.id == feature_id:
224
- return feature
225
- return None
226
-
227
-
228
- def get_all_feature_ids() -> List[str]:
229
- """Returns list of all feature IDs."""
230
- return [f.id for f in FEATURE_REGISTRY]
231
-
232
-
233
- def get_features_by_category() -> Dict[str, List[Feature]]:
234
- """Returns features grouped by category name."""
235
- result: Dict[str, List[Feature]] = {}
236
- for cat in FeatureCategory:
237
- features = [f for f in FEATURE_REGISTRY if f.category == cat]
238
- if features:
239
- result[cat.value] = features
240
- return result
241
-
242
-
243
- def get_default_enabled_features() -> List[str]:
244
- """Returns IDs of features enabled by default."""
245
- return [f.id for f in FEATURE_REGISTRY if f.default_enabled]
246
-
247
-
248
- def validate_feature_ids(feature_ids: List[str]) -> List[str]:
249
- """
250
- Validates a list of feature IDs against the registry.
251
- Returns list of invalid IDs (empty if all valid).
252
- """
253
- valid_ids = set(get_all_feature_ids())
254
- invalid = [fid for fid in feature_ids if fid not in valid_ids]
255
- return invalid
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/core/migrations.py DELETED
@@ -1,111 +0,0 @@
1
- """
2
- Automatic Schema Migration Utility
3
-
4
- This module runs at startup to ensure database columns match the SQLAlchemy models.
5
- It adds any missing columns automatically, preventing 'UndefinedColumn' errors
6
- in production when new fields are added to models.
7
- """
8
-
9
- from sqlalchemy import inspect, text
10
- from sqlalchemy.engine import Engine
11
- import logging
12
-
13
- logger = logging.getLogger(__name__)
14
-
15
-
16
- def get_model_columns(model_class):
17
- """Extract column definitions from a SQLAlchemy model class."""
18
- from sqlalchemy import Column
19
- columns = {}
20
- for attr_name in dir(model_class):
21
- attr = getattr(model_class, attr_name, None)
22
- if hasattr(attr, 'property') and hasattr(attr.property, 'columns'):
23
- col = attr.property.columns[0]
24
- columns[col.name] = col
25
- return columns
26
-
27
-
28
- def get_db_columns(engine: Engine, table_name: str):
29
- """Get existing column names from the database table."""
30
- inspector = inspect(engine)
31
- try:
32
- return {col['name'] for col in inspector.get_columns(table_name)}
33
- except Exception:
34
- return set()
35
-
36
-
37
- def get_column_type_sql(column):
38
- """Convert SQLAlchemy column type to SQL type string."""
39
- from sqlalchemy import Boolean, Integer, String, DateTime, Text, Float, JSON
40
-
41
- col_type = type(column.type)
42
-
43
- type_map = {
44
- Boolean: "BOOLEAN",
45
- Integer: "INTEGER",
46
- String: "VARCHAR(255)",
47
- DateTime: "TIMESTAMP",
48
- Text: "TEXT",
49
- Float: "FLOAT",
50
- JSON: "JSONB" # PostgreSQL JSON type
51
- }
52
-
53
- # Check for String with specific length
54
- if hasattr(column.type, 'length') and column.type.length:
55
- return f"VARCHAR({column.type.length})"
56
-
57
- return type_map.get(col_type, "TEXT")
58
-
59
-
60
- def get_default_sql(column):
61
- """Get SQL DEFAULT clause for a column."""
62
- if column.default is not None:
63
- default_val = column.default.arg
64
- if isinstance(default_val, bool):
65
- return "DEFAULT FALSE" if not default_val else "DEFAULT TRUE"
66
- elif isinstance(default_val, (int, float)):
67
- return f"DEFAULT {default_val}"
68
- elif isinstance(default_val, str):
69
- return f"DEFAULT '{default_val}'"
70
- elif isinstance(default_val, dict):
71
- return "DEFAULT '{}'"
72
- return ""
73
-
74
-
75
- def run_migrations(engine: Engine):
76
- """
77
- Check all models and add any missing columns to the database.
78
- This runs at application startup.
79
- """
80
- from app.models.user import User, Analysis, Payment
81
- from app.models.feature_flags import PlanFeatureOverride, PlanUploadLimit
82
-
83
- models = [User, Analysis, Payment, PlanFeatureOverride, PlanUploadLimit]
84
-
85
- for model in models:
86
- table_name = model.__tablename__
87
- model_cols = get_model_columns(model)
88
- db_cols = get_db_columns(engine, table_name)
89
-
90
- if not db_cols:
91
- # Table doesn't exist yet, let create_all handle it
92
- logger.info(f"Table '{table_name}' not found, will be created by create_all()")
93
- continue
94
-
95
- missing_cols = set(model_cols.keys()) - db_cols
96
-
97
- for col_name in missing_cols:
98
- col = model_cols[col_name]
99
- col_type = get_column_type_sql(col)
100
- default_clause = get_default_sql(col)
101
-
102
- sql = f'ALTER TABLE {table_name} ADD COLUMN {col_name} {col_type} {default_clause}'
103
-
104
- try:
105
- with engine.connect() as conn:
106
- conn.execute(text(sql))
107
- conn.commit()
108
- logger.info(f"✓ Added column '{col_name}' to table '{table_name}'")
109
- except Exception as e:
110
- # Column might already exist or other issue
111
- logger.warning(f"Could not add column '{col_name}' to '{table_name}': {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/core/plan_config.py DELETED
@@ -1,192 +0,0 @@
1
- """
2
- Plan Configuration - Default settings for each subscription plan.
3
-
4
- This module defines upload limits and default feature access per plan.
5
- Admins can override these defaults via the admin console.
6
- """
7
-
8
- from typing import Dict, List, Any
9
- from .feature_registry import get_all_feature_ids
10
-
11
-
12
- # =============================================================================
13
- # PLAN CONFIGURATION
14
- # =============================================================================
15
- # Each plan has:
16
- # - upload_limit: Monthly upload cap
17
- # - is_session: True for guest/anonymous (session-based tracking)
18
- # - features: List of enabled feature IDs, or ["*"] for all features
19
- # =============================================================================
20
-
21
- PLAN_DEFAULTS: Dict[str, Dict[str, Any]] = {
22
- # Guest users on /try page (session-based, no account)
23
- "Guest": {
24
- "upload_limit": 2,
25
- "is_session": True,
26
- "features": [
27
- "kpi_margins",
28
- "health_score",
29
- "risk_score",
30
- "pdf_export"
31
- ]
32
- },
33
-
34
- # Free trial - full Small Business experience for 1 month
35
- "Free Trial": {
36
- "upload_limit": 15,
37
- "is_session": False,
38
- "features": [
39
- "kpi_margins",
40
- "kpi_ratios",
41
- "health_score",
42
- "risk_score",
43
- "risk_factors",
44
- "runway_forecast",
45
- "burn_rate",
46
- "interactive_charts",
47
- "pdf_export"
48
- ]
49
- },
50
-
51
- # Individual plan - $9/month
52
- "Individual": {
53
- "upload_limit": 5,
54
- "is_session": False,
55
- "features": [
56
- "kpi_margins",
57
- "kpi_ratios",
58
- "health_score",
59
- "risk_score",
60
- "risk_factors",
61
- "pdf_export"
62
- ]
63
- },
64
-
65
- # Organization plan - $49/month
66
- "Organization": {
67
- "upload_limit": 10,
68
- "is_session": False,
69
- "features": [
70
- "kpi_margins",
71
- "kpi_ratios",
72
- "health_score",
73
- "risk_score",
74
- "risk_factors",
75
- "liquidity_risk",
76
- "runway_forecast",
77
- "ai_summary",
78
- "interactive_charts",
79
- "pdf_export"
80
- ]
81
- },
82
-
83
- # Small Business plan - $99/month
84
- "Small Business": {
85
- "upload_limit": 15,
86
- "is_session": False,
87
- "features": ["*"] # All features
88
- },
89
-
90
- # Mid Business plan - $249/month
91
- "Mid Business": {
92
- "upload_limit": 25,
93
- "is_session": False,
94
- "features": ["*"] # All features
95
- },
96
-
97
- # Large Business / Enterprise - $499+/month
98
- "Large Business": {
99
- "upload_limit": 50,
100
- "is_session": False,
101
- "features": ["*"] # All features
102
- },
103
-
104
- # Admin users - unlimited access
105
- "Admin": {
106
- "upload_limit": 999999,
107
- "is_session": False,
108
- "features": ["*"]
109
- },
110
-
111
- # Engine Configs (Treated as Plans for feature flags)
112
- "_ENGINE_v1": {
113
- "upload_limit": 0,
114
- "is_session": False,
115
- "features": ["*"]
116
- },
117
- "_ENGINE_v2": {
118
- "upload_limit": 0,
119
- "is_session": False,
120
- "features": [
121
- "kpi_margins", "kpi_ratios", "health_score", "risk_score", "risk_factors",
122
- "runway_forecast", "burn_rate", "interactive_charts", "pdf_export",
123
- "ai_summary", "intelligence_card"
124
- # Note: Geo Insights and AI CFO omitted by default for Lite Engine
125
- ]
126
- }
127
- }
128
-
129
-
130
-
131
- # Special "Plan" names for Engine Feature Configuration
132
- ENGINE_PLANS = ["_ENGINE_v1", "_ENGINE_v2"]
133
-
134
- # Mappings for UI display
135
- ENGINE_DISPLAY_NAMES = {
136
- "_ENGINE_v1": "Visi-Insight-1 (Standard)",
137
- "_ENGINE_v2": "Visi-Insight-2 (Lite)"
138
- }
139
-
140
-
141
- # =============================================================================
142
- # HELPER FUNCTIONS
143
- # =============================================================================
144
-
145
- def get_plan_config(plan_name: str) -> Dict[str, Any]:
146
- """
147
- Get configuration for a specific plan.
148
- Falls back to Individual if plan not found.
149
- """
150
- return PLAN_DEFAULTS.get(plan_name, PLAN_DEFAULTS["Individual"])
151
-
152
-
153
- def get_upload_limit(plan_name: str) -> int:
154
- """Get the monthly upload limit for a plan."""
155
- config = get_plan_config(plan_name)
156
- return config.get("upload_limit", 5)
157
-
158
-
159
- def get_default_features(plan_name: str) -> List[str]:
160
- """
161
- Get list of enabled feature IDs for a plan.
162
- Expands ["*"] to all feature IDs.
163
- """
164
- config = get_plan_config(plan_name)
165
- features = config.get("features", [])
166
-
167
- if "*" in features:
168
- return get_all_feature_ids()
169
-
170
- return features
171
-
172
-
173
- def is_session_based(plan_name: str) -> bool:
174
- """Check if plan uses session-based tracking (for guests)."""
175
- config = get_plan_config(plan_name)
176
- return config.get("is_session", False)
177
-
178
-
179
- def get_all_plans() -> List[str]:
180
- """Returns list of all plan names."""
181
- return list(PLAN_DEFAULTS.keys())
182
-
183
-
184
- def get_billable_plans() -> List[str]:
185
- """Returns plans that are actual subscription tiers (excludes Guest/Admin)."""
186
- return [p for p in PLAN_DEFAULTS.keys() if p not in ("Guest", "Admin")]
187
-
188
-
189
- def get_all_engines() -> List[str]:
190
- """Returns list of engine identifier keys."""
191
- return ENGINE_PLANS
192
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/core/security.py DELETED
@@ -1,28 +0,0 @@
1
- from datetime import datetime, timedelta
2
- from typing import Optional
3
- from jose import JWTError, jwt
4
- from passlib.context import CryptContext
5
- from app.core.config import settings
6
-
7
- # Config
8
- SECRET_KEY = settings.SECRET_KEY
9
- ALGORITHM = settings.ALGORITHM
10
- ACCESS_TOKEN_EXPIRE_MINUTES = settings.ACCESS_TOKEN_EXPIRE_MINUTES
11
-
12
- pwd_context = CryptContext(schemes=["argon2"], deprecated="auto")
13
-
14
- def verify_password(plain_password, hashed_password):
15
- return pwd_context.verify(plain_password, hashed_password)
16
-
17
- def get_password_hash(password):
18
- return pwd_context.hash(password)
19
-
20
- def create_access_token(data: dict, expires_delta: Optional[timedelta] = None):
21
- to_encode = data.copy()
22
- if expires_delta:
23
- expire = datetime.utcnow() + expires_delta
24
- else:
25
- expire = datetime.utcnow() + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
26
- to_encode.update({"exp": expire})
27
- encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
28
- return encoded_jwt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/core/stripe_config.py DELETED
@@ -1,29 +0,0 @@
1
- import stripe
2
- from app.core.config import settings
3
-
4
- stripe.api_key = settings.STRIPE_SECRET_KEY
5
-
6
- def create_checkout_session(db_user, plan_id: str):
7
- try:
8
- checkout_session = stripe.checkout.Session.create(
9
- customer_email=db_user.email,
10
- client_reference_id=str(db_user.id),
11
- payment_method_types=['card'],
12
- line_items=[
13
- {
14
- 'price': plan_id,
15
- 'quantity': 1,
16
- },
17
- ],
18
- mode='subscription',
19
- success_url=f"{settings.ALLOWED_ORIGINS[0]}/dashboard?session_id={{CHECKOUT_SESSION_ID}}",
20
- cancel_url=f"{settings.ALLOWED_ORIGINS[0]}/pricing",
21
- metadata={
22
- 'user_id': db_user.id,
23
- 'plan_name': 'Business' # Or derive from plan_id
24
- }
25
- )
26
- return checkout_session
27
- except Exception as e:
28
- print(f"Stripe Error: {e}")
29
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/main.py DELETED
@@ -1,124 +0,0 @@
1
- from fastapi import FastAPI
2
- from fastapi.middleware.cors import CORSMiddleware
3
-
4
- app = FastAPI(
5
- title="Visique API",
6
- description="Financial Data Analyzer Backend",
7
- version="0.1.0"
8
- )
9
-
10
- from app.core.config import settings
11
-
12
- # CORS Configuration
13
- # Ensure Vercel domains are allowed even if env vars override config defaults
14
- origins = []
15
- if isinstance(settings.ALLOWED_ORIGINS, list):
16
- origins.extend(settings.ALLOWED_ORIGINS)
17
- else:
18
- origins.append(str(settings.ALLOWED_ORIGINS))
19
-
20
- extra_origins = [
21
- "https://visique-testing.vercel.app",
22
- "https://visique-frontend.vercel.app",
23
- # Specific current previews
24
- "https://visique-testing-7qdi0vaqf-sams-projects-85f65c65.vercel.app",
25
- "https://visique-testing-fky1isli2-sams-projects-85f65c65.vercel.app"
26
- ]
27
-
28
- for origin in extra_origins:
29
- if origin not in origins:
30
- origins.append(origin)
31
-
32
- app.add_middleware(
33
- CORSMiddleware,
34
- allow_origins=origins,
35
- # Allow any Vercel preview domain for this specific project
36
- allow_origin_regex=r"https://visique-testing-.*-sams-projects-85f65c65\.vercel\.app",
37
- allow_credentials=True,
38
- allow_methods=["*"],
39
- allow_headers=["*"],
40
- )
41
-
42
- @app.get("/")
43
- async def root():
44
- return {"message": "Welcome to Visique Financial Analyzer API"}
45
-
46
- @app.get("/health")
47
- async def health_check():
48
- return {"status": "healthy"}
49
-
50
- from app.api.endpoints import router as analysis_router
51
- from app.api.auth import router as auth_router
52
- from app.core.database import engine, Base
53
-
54
- # Run Automatic Schema Migrations (adds missing columns)
55
- from app.core.migrations import run_migrations
56
- run_migrations(engine)
57
-
58
- # Create Tables (for new tables only, migrations handles columns)
59
- Base.metadata.create_all(bind=engine)
60
-
61
- app.include_router(analysis_router, prefix="/api/v1")
62
- app.include_router(auth_router, prefix="/api/v1")
63
-
64
- from app.api.admin import router as admin_router
65
- app.include_router(admin_router, prefix="/api/v1")
66
-
67
- # Mount Static Files for Uploads
68
- from fastapi.staticfiles import StaticFiles
69
- import os
70
-
71
- # Ensure upload directory exists
72
- upload_dir = "uploads"
73
- if not os.path.exists(upload_dir):
74
- os.makedirs(upload_dir)
75
-
76
- # Mount /api/v1/static to the uploads directory
77
- app.mount("/api/v1/static", StaticFiles(directory="uploads"), name="static")
78
-
79
- from sqlalchemy import text
80
- from app.core.database import SessionLocal
81
-
82
- # Startup Migration for V2 Engine Support
83
- @app.on_event("startup")
84
- def run_migrations():
85
- try:
86
- db = SessionLocal()
87
- # Add preferred_engine column if it doesn't exist
88
- db.execute(text("ALTER TABLE users ADD COLUMN IF NOT EXISTS preferred_engine VARCHAR DEFAULT 'v1'"))
89
- db.commit()
90
- db.close()
91
- print("Startup Migration: Verified preferred_engine column.")
92
- except Exception as e:
93
- print(f"Startup Migration Warning: {e}")
94
-
95
- # Keep-Alive Background Task to prevent Render free tier from sleeping
96
- import asyncio
97
- import httpx
98
-
99
- async def keep_alive_task():
100
- """Pings the health endpoint every 5 minutes to prevent cold starts."""
101
- # Wait for initial startup to complete
102
- await asyncio.sleep(60)
103
-
104
- # Get the app URL from environment or use default
105
- app_url = os.environ.get("RENDER_EXTERNAL_URL", "https://visique-backend.onrender.com")
106
- health_url = f"{app_url}/health"
107
-
108
- print(f"[Keep-Alive] Started. Pinging {health_url} every 5 minutes.")
109
-
110
- async with httpx.AsyncClient() as client:
111
- while True:
112
- try:
113
- response = await client.get(health_url, timeout=30)
114
- print(f"[Keep-Alive] Ping successful: {response.status_code}")
115
- except Exception as e:
116
- print(f"[Keep-Alive] Ping failed: {e}")
117
-
118
- # Wait 5 minutes (300 seconds) before next ping
119
- await asyncio.sleep(300)
120
-
121
- @app.on_event("startup")
122
- async def start_keep_alive():
123
- """Starts the keep-alive background task on app startup."""
124
- asyncio.create_task(keep_alive_task())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/models/feature_flags.py DELETED
@@ -1,59 +0,0 @@
1
- """
2
- Feature Flags Model - Admin-managed feature overrides per plan.
3
-
4
- This model stores per-plan feature overrides that take precedence
5
- over the defaults defined in plan_config.py.
6
- """
7
-
8
- from sqlalchemy import Column, Integer, String, Boolean, DateTime, ForeignKey
9
- from sqlalchemy.orm import relationship
10
- from datetime import datetime
11
- from app.core.database import Base
12
-
13
-
14
class PlanFeatureOverride(Base):
    """
    Admin override of one feature's availability for one plan.

    Resolution when checking a feature for a plan:
    1. If a row for the plan/feature exists here, its ``enabled`` value wins.
    2. Otherwise the defaults from plan_config.py apply.
    """
    __tablename__ = "plan_feature_overrides"

    id = Column(Integer, primary_key=True, index=True)
    plan_name = Column(String, index=True, nullable=False)
    feature_id = Column(String, index=True, nullable=False)
    enabled = Column(Boolean, default=True, nullable=False)

    # Audit fields: set on insert, refreshed on update; editor is a users.id
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    updated_by_id = Column(Integer, ForeignKey("users.id"), nullable=True)

    def __repr__(self):
        return f"<PlanFeatureOverride {self.plan_name}:{self.feature_id}={'enabled' if self.enabled else 'disabled'}>"
37
-
38
-
39
class PlanUploadLimit(Base):
    """
    Admin override of the upload limit for one plan (one row per plan).

    Resolution when checking the upload limit for a plan:
    1. If a row for the plan exists here, its ``upload_limit`` wins.
    2. Otherwise the defaults from plan_config.py apply.
    """
    __tablename__ = "plan_upload_limits"

    id = Column(Integer, primary_key=True, index=True)
    plan_name = Column(String, unique=True, index=True, nullable=False)
    upload_limit = Column(Integer, nullable=False)

    # Audit fields: set on insert, refreshed on update; editor is a users.id
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    updated_by_id = Column(Integer, ForeignKey("users.id"), nullable=True)

    def __repr__(self):
        return "<PlanUploadLimit {}={}>".format(self.plan_name, self.upload_limit)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/models/user.py DELETED
@@ -1,63 +0,0 @@
1
- import sqlalchemy
2
- from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, Text, Boolean
3
- from sqlalchemy.orm import relationship
4
- from datetime import datetime
5
- from app.core.database import Base
6
-
7
class User(Base):
    """Account row: credentials, plan, profile, upload counters and add-ons.

    Deleting a user also deletes the related Analysis and Payment rows
    (``cascade="all, delete-orphan"`` on both relationships).
    """
    __tablename__ = "users"

    id = Column(Integer, primary_key=True, index=True)
    email = Column(String, unique=True, index=True)
    hashed_password = Column(String)
    full_name = Column(String, nullable=True)
    company_name = Column(String, nullable=True)
    plan = Column(String, default="Free")
    plan_expires_at = Column(DateTime, nullable=True)
    is_admin = Column(Boolean, default=False)
    is_super_admin = Column(Boolean, default=False)
    created_at = Column(DateTime, default=datetime.utcnow)

    # New Fields for Verification & Profile
    visique_id = Column(String, unique=True, index=True, nullable=True)  # Generated VSQ-XXXX
    ein = Column(String, nullable=True)
    address = Column(String, nullable=True)
    profile_picture_url = Column(String, nullable=True)
    industry = Column(String, default="General")
    preferred_engine = Column(String, default="v1")  # "v1" (Standard) or "v2" (Lite)

    # Upload Tracking
    monthly_upload_count = Column(Integer, default=0)
    upload_reset_date = Column(DateTime, default=datetime.utcnow)

    # Custom User-Level Feature Overrides (Add-ons)
    # Stores { feature_id: bool }. The default is the ``dict`` callable so each
    # row gets its own empty dict instead of sharing one mutable ``{}`` object.
    custom_features = Column(sqlalchemy.JSON, default=dict)

    analyses = relationship("Analysis", back_populates="owner", cascade="all, delete-orphan")
    payments = relationship("Payment", back_populates="user", cascade="all, delete-orphan")
38
-
39
class Analysis(Base):
    """One stored analysis run belonging to a user."""
    __tablename__ = "analyses"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(Integer, ForeignKey("users.id"))
    timestamp = Column(DateTime, default=datetime.utcnow)

    company_name = Column(String)
    input_filename = Column(String)
    stored_filename = Column(String)  # Path to saved file on disk
    result_json = Column(Text)

    # Reverse side of User.analyses
    owner = relationship("User", back_populates="analyses")
51
-
52
class Payment(Base):
    """One payment record belonging to a user."""
    __tablename__ = "payments"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(Integer, ForeignKey("users.id"))
    # NOTE(review): the unit (cents vs. dollars) was never settled; the column
    # is an Integer while the PaymentBase API schema declares amount as float.
    # Confirm the intended unit before relying on this value.
    amount = Column(Integer)
    status = Column(String)  # paid, pending, overdue
    date = Column(DateTime, default=datetime.utcnow)
    plan_name = Column(String)
    invoice_pdf = Column(String, nullable=True)  # Path to invoice file

    # Reverse side of User.payments
    user = relationship("User", back_populates="payments")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/schemas/chat.py DELETED
@@ -1,14 +0,0 @@
1
- from pydantic import BaseModel
2
- from typing import List, Optional
3
-
4
class Message(BaseModel):
    """A single chat turn."""
    role: str  # "user" or "assistant"
    content: str
7
-
8
class ChatRequest(BaseModel):
    """Chat request body: the message list plus an optional context filter."""
    messages: List[Message]
    context_filter: Optional[str] = None  # e.g. "Balance Sheet", "Risk Report"
11
-
12
class ChatResponse(BaseModel):
    """Chat reply text together with its supporting sources."""
    response: str
    sources: List[str] = []  # Citations or references to specific data points
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/schemas/financial.py DELETED
@@ -1,47 +0,0 @@
1
- import sys
2
- from pathlib import Path
3
-
4
# Dynamic path resolution for the 'financial_model' library.
# Layout: root/visique/backend/app/schemas/financial.py -> root/financial_model
# To import 'financial_model' as a package, 'root' must be on sys.path.
def _ensure_financial_model_on_path():
    """Put the directory that contains 'financial_model' at the front of sys.path.

    Candidates are tried in order: the directory five levels above this file,
    the current working directory, then two levels above the working
    directory. The first candidate containing 'financial_model' is used and
    the search stops. Best-effort: nothing is raised here; if no candidate
    matches, the import below reports the failure.
    """
    candidates = []
    try:
        # parents[0]=schemas, [1]=app, [2]=backend, [3]=visique, [4]=root
        candidates.append(Path(__file__).resolve().parents[4])
    except (IndexError, NameError, OSError, RuntimeError):
        # File sits too shallow in the tree or cannot be resolved. Fall
        # through to the working-directory candidates instead of giving up.
        pass
    try:
        cwd = Path.cwd()
        candidates.extend([cwd, cwd.parent.parent])
    except OSError:
        # Working directory is unavailable; keep whatever candidates we have.
        pass

    for root in candidates:
        try:
            found = (root / "financial_model").exists()
        except OSError:
            continue
        if found:
            root_str = str(root)
            if root_str not in sys.path:
                sys.path.insert(0, root_str)
            return


_ensure_financial_model_on_path()

try:
    # Now import from the PACKAGE "financial_model"
    from financial_model.models import (
        PeriodType, Currency,
        IncomeStatementStandard, BalanceSheetStandard, CashFlowStandard, OperatingMetrics,
        DocumentClassification,
        FinancialReport, KPIMetrics, BudgetModel, VarianceAnalysis, RiskAnalysis,
        HealthScoreBreakdown, GeoAnalysis, RunwayForecast, OptimizationInsight,
        StandardizedDataPackage
    )
except ImportError:
    # Print a hint, then re-raise: this module is unusable without the library.
    print("WARNING: Could not import from financial_model library. Ensure project root is in PYTHONPATH.")
    raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/schemas/user.py DELETED
@@ -1,82 +0,0 @@
1
- from pydantic import BaseModel, EmailStr
2
- from typing import Optional, List
3
- from datetime import datetime
4
-
5
class UserBase(BaseModel):
    """Fields shared by every user schema."""
    email: str  # was EmailStr
7
-
8
class UserCreate(UserBase):
    """Registration payload: email, password and optional profile fields."""
    password: str
    full_name: Optional[str] = None
    company_name: Optional[str] = None
    admin_key: Optional[str] = None
13
-
14
class UserLogin(UserBase):
    """Login payload: email plus password."""
    password: str
16
-
17
class UserResponse(UserBase):
    """User representation returned by the API; can be built from object attributes."""
    id: int
    full_name: Optional[str] = None
    company_name: Optional[str] = None
    plan: str = "Free"
    plan_expires_at: Optional[datetime] = None
    is_admin: bool = False
    is_super_admin: bool = False
    created_at: datetime

    # New Fields
    visique_id: Optional[str] = None
    ein: Optional[str] = None
    address: Optional[str] = None
    profile_picture_url: Optional[str] = None
    industry: Optional[str] = None
    preferred_engine: Optional[str] = "v1"
    custom_features: Optional[dict] = None  # JSON feature overrides

    class Config:
        # Allow population from attribute-bearing objects, not only dicts
        from_attributes = True
38
-
39
class Token(BaseModel):
    """Access token together with its token type."""
    access_token: str
    token_type: str
42
-
43
-
44
-
45
class TokenData(BaseModel):
    """Token payload holder: an optional email."""
    email: Optional[str] = None
47
-
48
class AnalysisBase(BaseModel):
    """Summary fields of a stored analysis (the result payload is not included)."""
    company_name: str
    input_filename: str
    timestamp: datetime
    # result_json is heavy, maybe separate detail view
53
-
54
class AnalysisResponse(AnalysisBase):
    """Analysis summary plus its id and owning user id."""
    id: int
    user_id: int

    class Config:
        # Allow population from attribute-bearing objects, not only dicts
        from_attributes = True
60
-
61
class UpgradeRequest(BaseModel):
    """Plan upgrade / checkout payload.

    NOTE(review): this carries the raw card number, expiry and CVV as plain
    strings - confirm how these are handled downstream.
    """
    plan_name: str
    amount: float = 0.0
    card_number: str
    expiry: str
    cvv: str
    # New Checkout Fields
    address: Optional[str] = None
    ein: Optional[str] = None
70
-
71
class PaymentBase(BaseModel):
    """Common payment fields: amount, status, plan and date."""
    amount: float
    status: str
    plan_name: str
    date: datetime
76
-
77
class PaymentResponse(PaymentBase):
    """Payment fields plus the record id and an optional invoice reference."""
    id: int
    invoice_pdf: Optional[str] = None

    class Config:
        # Allow population from attribute-bearing objects, not only dicts
        from_attributes = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/__init__.py DELETED
@@ -1,37 +0,0 @@
1
- """
2
- Services Layer
3
-
4
- This package contains all business logic for the Visique platform.
5
-
6
- ## Module Index
7
-
8
- - `feature_service` - Feature flag resolution and plan management
9
- - `analysis/` - Financial analysis and calculations
10
- - `ingestion/` - Data parsing (CSV, PDF)
11
- - `intelligence/` - AI-powered features (Gemini, RAG)
12
- - `reporting/` - Report generation (PDF, PPTX)
13
-
14
- ## Usage Pattern
15
-
16
- ```python
17
- from app.services.feature_service import get_effective_features, check_upload_limit
18
- from app.services.analysis.fundamental import FundamentalAnalyzer
19
- from app.services.intelligence.gemini_service import GeminiService
20
- ```
21
-
22
- ## Design Principles
23
-
24
- 1. **Stateless**: Services don't hold state between calls
25
- 2. **Testable**: All dependencies injected as parameters
26
- 3. **Single Purpose**: Each module handles one domain
27
- 4. **Error Handling**: Raise specific exceptions, don't swallow errors
28
- """
29
-
30
- # Re-export commonly used functions for convenience
31
- # NOTE: Commented out for AI Worker context to avoid heavy dependencies (SQLAlchemy)
32
- # from app.services.feature_service import (
33
- # get_effective_features,
34
- # check_upload_limit,
35
- # increment_upload_count,
36
- # get_effective_upload_limit,
37
- # )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/analysis/__init__.py DELETED
@@ -1,54 +0,0 @@
1
- """
2
- Financial Analysis Services
3
-
4
- This package contains the core financial analysis logic.
5
-
6
- ## Module Responsibilities
7
-
8
- | Module | Purpose | Key Functions |
9
- |--------|---------|---------------|
10
- | `fundamental.py` | Main orchestrator | `FundamentalAnalyzer.analyze()` |
11
- | `kpi.py` | KPI calculations | `KPIAnalyzer.calculate_metrics()` |
12
- | `risk.py` | Risk assessment | `RiskAnalyzer.analyze()` |
13
- | `health_score.py` | Overall health | `HealthScoreAnalyzer.calculate()` |
14
- | `growth.py` | Growth metrics | `GrowthAnalyzer.analyze_growth_potential()` |
15
- | `simulation.py` | What-if modeling | `SimulationService.run_simulation()` |
16
-
17
- ## Data Flow
18
-
19
- ```
20
- Raw Data (CSV/PDF)
21
-
22
- Ingestion Layer (parsed dict)
23
-
24
- FundamentalAnalyzer.analyze()
25
- ├── KPI Calculator
26
- ├── Risk Analyzer
27
- ├── Health Score
28
- ├── Growth Metrics
29
- └── (optional) AI Enrichment
30
-
31
- StandardizedDataPackage
32
- ```
33
-
34
- ## Usage
35
-
36
- ```python
37
- from app.services.analysis.fundamental import FundamentalAnalyzer
38
-
39
- # analyze() is a synchronous static method; no instance is needed
40
- results = FundamentalAnalyzer.analyze(report, enabled_features=["geo_insights"])
41
- # results is a dict of analysis sections (e.g. results["kpis"])
42
- ```
43
-
44
- ## Adding New Analysis Modules
45
-
46
- 1. Create new file in this directory (e.g., `budget.py`)
47
- 2. Define calculation functions with type hints
48
- 3. Import and call from `FundamentalAnalyzer.analyze()`
49
- 4. Add result to `StandardizedDataPackage` schema
50
- 5. (Optional) Register as feature in `feature_registry.py`
51
- """
52
-
53
- # Re-export main analyzer for convenience
54
- from app.services.analysis.fundamental import FundamentalAnalyzer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/analysis/engine_lite.py DELETED
@@ -1,48 +0,0 @@
1
- import sys
2
- from pathlib import Path
3
- from typing import List, Optional, Dict, Any
4
- from app.schemas.financial import FinancialReport, BudgetModel
5
-
6
# Make the directory holding 'financial_model' importable (best-effort).
try:
    current_file = Path(__file__).resolve()
    project_root = current_file.parents[4]
    # Only prepend when the package is really there and not already listed.
    if (project_root / "financial_model").exists() and str(project_root) not in sys.path:
        sys.path.insert(0, str(project_root))
except Exception:
    pass
15
-
16
- try:
17
- from financial_model.core import FinancialAnalyzer
18
- except ImportError:
19
- # Fallback
20
- sys.path.insert(0, "../../../../../")
21
- from financial_model.core import FinancialAnalyzer
22
-
23
class LiteAnalyzer:
    """
    Visi-Insight-2 (Lite Engine)
    Optimized for memory-constrained environments.
    - No External API calls (GeoService removed)
    - No Heavy Simulation (if added in future)
    - Pure Mathematical Analysis only
    """
    @staticmethod
    def analyze(report: FinancialReport, budget: Optional[BudgetModel] = None, comparisons: Optional[List[FinancialReport]] = None, user_address: Optional[str] = None, enabled_features: Optional[List[str]] = None) -> Dict[str, Any]:
        """Run the library analysis and tag the result as produced by the Lite engine.

        Args:
            report: Financial report to analyze.
            budget: Optional budget forwarded to the library analyzer.
            comparisons: Optional comparison reports forwarded to the library analyzer.
            user_address: Optional address forwarded to the library analyzer.
            enabled_features: Accepted so the signature matches
                FundamentalAnalyzer.analyze; not used by this engine. Defaults
                to None rather than a shared mutable list.

        Returns:
            The dict from FinancialAnalyzer.run_full_analysis, with a 'meta'
            entry added and 'geo_analysis' forced to None.
        """
        # Run Pure Math Analysis
        analyzer = FinancialAnalyzer(report)
        results = analyzer.run_full_analysis(budget, comparisons, user_address)

        # Tag result as Lite
        results['meta'] = {
            "engine": "Visi-Insight-2 (Lite)",
            "optimized": True
        }

        # Explicitly exclude heavy/external modules like GeoService
        results['geo_analysis'] = None

        return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/analysis/factory.py DELETED
@@ -1,18 +0,0 @@
1
- from app.models.user import User
2
- from app.services.analysis.fundamental import FundamentalAnalyzer
3
- from app.services.analysis.engine_lite import LiteAnalyzer
4
-
5
class AnalysisFactory:
    @staticmethod
    def get_analyzer(user: User):
        """
        Pick the analyzer class matching the user's engine preference.

        Returns LiteAnalyzer when ``user.preferred_engine`` equals 'v2'.
        Every other value, or a missing attribute, selects FundamentalAnalyzer.
        The class itself is returned, not an instance.
        """
        wants_lite = getattr(user, 'preferred_engine', 'v1') == 'v2'
        return LiteAnalyzer if wants_lite else FundamentalAnalyzer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/analysis/fundamental.py DELETED
@@ -1,75 +0,0 @@
1
- import sys
2
- from pathlib import Path
3
- from typing import List, Optional, Dict, Any
4
-
5
# Make the directory holding the 'financial_model' package importable (best-effort).
try:
    current_file = Path(__file__).resolve()
    project_root = current_file.parents[4]
    # Only prepend when the package is really there and not already listed.
    if (project_root / "financial_model").exists() and str(project_root) not in sys.path:
        sys.path.insert(0, str(project_root))
except Exception:
    pass
14
-
15
- from app.schemas.financial import (
16
- FinancialReport,
17
- BudgetModel,
18
- StandardizedDataPackage
19
- )
20
- # Import Core Logic from Library Package
21
- try:
22
- from financial_model.core import FinancialAnalyzer
23
- except ImportError:
24
- # If path setup failed, try forcing the path
25
- sys.path.insert(0, "../../../../../")
26
- from financial_model.core import FinancialAnalyzer
27
-
28
class FundamentalAnalyzer:
    @staticmethod
    def _resolve_analysis_address(report, user_address):
        """Return ``(address, is_own_company)`` for the geo lookup.

        Preference order: the report's company address, then the user's
        address, then a "<company name> Location" placeholder.
        """
        company_address = getattr(report, 'company_address', None)
        if company_address:
            # Same address as the user's (case/whitespace-insensitive) means own company.
            is_own = bool(user_address) and user_address.lower().strip() == company_address.lower().strip()
            return company_address, is_own
        if user_address:
            return user_address, True
        return f"{report.company_name} Location", False

    @staticmethod
    def analyze(report: FinancialReport, budget: Optional[BudgetModel] = None, comparisons: Optional[List[FinancialReport]] = None, user_address: Optional[str] = None, enabled_features: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Main entry point for analysis.
        Delegates core logic to the independent 'financial_model' library.
        Enhances result with external services (GeoService).

        Args:
            report: Financial report to analyze.
            budget: Optional budget forwarded to the library analyzer.
            comparisons: Optional comparison reports forwarded to the library analyzer.
            user_address: Optional user address; forwarded to the library and
                used as a fallback location for the geo lookup.
            enabled_features: Feature ids enabled for the caller; the geo
                lookup runs only when "geo_insights" is present. None (the
                default) means no optional features.

        Returns:
            The dict from FinancialAnalyzer.run_full_analysis, with
            'geo_analysis' set when the geo lookup produced a result.
        """
        # None instead of a shared mutable [] default.
        features = enabled_features if enabled_features is not None else []

        # 1. Run Pure Financial Analysis (Library)
        analyzer = FinancialAnalyzer(report)
        results = analyzer.run_full_analysis(budget, comparisons, user_address)

        # 2. Inject External Services (Geo Intelligence)
        # This keeps the library pure and the backend handling integration
        geo_analysis = None
        analysis_address, is_own_company = FundamentalAnalyzer._resolve_analysis_address(report, user_address)

        if "geo_insights" in features and analysis_address:
            try:
                # Imported lazily so the analyzer still works without the geo service.
                from app.services.intelligence.geo_service import GeoService
                geo_analysis = GeoService.analyze_location(
                    analysis_address,
                    report.metrics.industry,
                    is_own_company=is_own_company,
                    company_name=report.company_name
                )
            except ImportError:
                print("Warning: GeoService not available.")
            except Exception as e:
                # Geo enrichment is optional; report and continue without it.
                print(f"Error in GeoService: {e}")

        if geo_analysis:
            results['geo_analysis'] = geo_analysis

        return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/analysis/growth.py DELETED
@@ -1,26 +0,0 @@
1
- from app.schemas.financial import FinancialReport
2
-
3
class GrowthAnalyzer:
    @staticmethod
    def analyze_growth_potential(report: FinancialReport) -> str:
        """
        A modular analyzer that looks for growth signals.

        Uses single-period heuristics on the income statement: revenue above
        $1M and an operating margin above 20%. A missing (None) revenue is
        treated as zero instead of raising.

        Returns:
            A sentence starting with "Growth Potential: " followed by the
            detected signals, or a "Stable / Needs more historical data."
            message when none apply.
        """
        # In a real model, this would compare current vs previous periods.
        # Only one period is available, so these are heuristics.
        income = report.income_statement
        revenue = income.revenue or 0  # None -> 0 so the comparison below cannot raise

        signals = []
        if revenue > 1_000_000:
            signals.append("High Volume Business: Revenue > $1M suggests established market presence.")

        if income.operating_income and revenue:
            if (income.operating_income / revenue) > 0.20:
                signals.append("Scalable Model: Operating margins > 20% indicate high growth potential.")

        if not signals:
            return "Growth Potential: Stable / Needs more historical data."

        return "Growth Potential: " + " ".join(signals)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/analysis/health_score.py DELETED
@@ -1,46 +0,0 @@
1
- from app.schemas.financial import KPIMetrics, HealthScoreBreakdown
2
-
3
class HealthScoreAnalyzer:
    @staticmethod
    def _points_above(value, tiers):
        """Return the points of the first ``(threshold, points)`` tier that ``value`` exceeds.

        A falsy value (None or 0) scores 0, as does a value above no threshold.
        """
        if not value:
            return 0
        for threshold, points in tiers:
            if value > threshold:
                return points
        return 0

    @staticmethod
    def calculate(metrics: KPIMetrics) -> HealthScoreBreakdown:
        """Turn KPI metrics into a four-part health score.

        Components: stability (liquidity/debt), profitability (margins and
        ROE), growth (fixed baseline of 10) and efficiency (baseline 10,
        plus 10 when DSO is under 45). The total is their sum, capped at 100.
        """
        above = HealthScoreAnalyzer._points_above

        # 1. Stability (Liquidity/Debt) - Max 25
        stability = above(metrics.current_ratio, ((1.5, 15), (1.0, 10)))
        leverage = metrics.debt_to_equity
        if not leverage:
            # Assume acceptable if no debt info
            stability += 10
        elif leverage < 1.0:
            stability += 10
        elif leverage < 2.0:
            stability += 5

        # 2. Profitability (Margins) - Max 35
        profitability = (
            above(metrics.net_margin, ((15, 15), (5, 10), (0, 5)))
            + above(metrics.gross_margin, ((40, 10), (20, 5)))
            + above(metrics.roe, ((15, 10),))
        )

        # 3. Growth (Placeholder / Revenue Trajectory) - Max 20
        # With a single snapshot only a baseline is awarded.
        growth = 10

        # 4. Efficiency - Max 20
        efficiency = 10
        dso = metrics.dso
        if dso and dso < 45:
            efficiency += 10

        return HealthScoreBreakdown(
            stability=stability,
            profitability=profitability,
            growth=growth,
            efficiency=efficiency,
            total_score=min(100, stability + profitability + growth + efficiency)
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/analysis/kpi.py DELETED
@@ -1,56 +0,0 @@
1
- from app.schemas.financial import IncomeStatementStandard, BalanceSheetStandard, CashFlowStandard, KPIMetrics
2
-
3
class KPIAnalyzer:
    @staticmethod
    def calculate_metrics(report: 'FinancialReport') -> KPIMetrics:
        """Compute KPI metrics from a report's income statement and balance sheet.

        Fills profitability margins, current ratio, debt-to-equity, ROE, DSO,
        prime cost, and - when available - restaurant margin and effective
        tax rate. Percent-style metrics are multiplied by 100.

        Values found in ``report.metadata`` under
        "extracted_restaurant_margin" / "extracted_effective_tax_rate" take
        precedence; a value that cannot be converted to float is ignored.
        """
        income = report.income_statement
        balance = report.balance_sheet

        metrics = KPIMetrics()

        # Profitability
        # NOTE(review): a zero/missing revenue falls back to a denominator of
        # 1.0, so the "margins" then equal the raw amounts times 100.
        rev = income.revenue or 1.0  # Avoid div by zero

        metrics.gross_margin = (income.gross_profit / rev) * 100
        metrics.operating_margin = (income.operating_income / rev) * 100
        metrics.net_margin = (income.net_income / rev) * 100

        # Liquidity
        curr_liab = balance.total_current_liabilities or 1.0
        metrics.current_ratio = balance.total_current_assets / curr_liab

        # Solvency
        equity = balance.total_equity or 1.0
        if balance.total_liabilities:
            metrics.debt_to_equity = balance.total_liabilities / equity

        metrics.roe = (income.net_income / equity) * 100

        # Efficiency
        daily_sales = rev / 365
        if daily_sales > 0 and balance.accounts_receivable:
            metrics.dso = balance.accounts_receivable / daily_sales

        # Restaurant / Service Specific
        # Prime Cost = (COGS + Payroll) / Revenue
        metrics.prime_cost = ((income.cogs + income.payroll_expenses) / rev) * 100

        # Extracted or Calculated Extra Metrics
        # 1. Restaurant Margin
        if "extracted_restaurant_margin" in report.metadata:
            try:
                metrics.restaurant_margin = float(report.metadata["extracted_restaurant_margin"])
            except (TypeError, ValueError, OverflowError):
                # Unparseable extracted value: leave the metric unset.
                pass

        # 2. Effective Tax Rate
        if "extracted_effective_tax_rate" in report.metadata:
            try:
                metrics.effective_tax_rate = float(report.metadata["extracted_effective_tax_rate"])
            except (TypeError, ValueError, OverflowError):
                # Unparseable extracted value: leave the metric unset
                # (the computed fallback below is intentionally not used).
                pass
        elif income.taxes > 0 and income.net_income > 0:
            pre_tax = income.net_income + income.taxes
            metrics.effective_tax_rate = (income.taxes / pre_tax) * 100

        return metrics
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/analysis/risk.py DELETED
@@ -1,57 +0,0 @@
1
- from typing import List
2
- from app.schemas.financial import KPIMetrics, RiskAnalysis
3
-
4
class RiskAnalyzer:
    @staticmethod
    def analyze(metrics: KPIMetrics, balance_cash: float = 0.0, monthly_burn: float = 0.0) -> RiskAnalysis:
        """Score financial risk starting from 100 and subtracting penalties.

        Penalties are applied for a low current ratio, high debt-to-equity,
        a negative net margin and a short cash runway
        (``balance_cash / monthly_burn``, only when ``monthly_burn`` > 0).
        The score never goes below 0. Each penalty adds a human-readable
        entry to the returned risk factors.
        """
        score = 100.0
        factors = []
        liquidity = "Low Risk"  # Default assumes good
        solvency = "Low Risk"

        # 1. Liquidity Risk (Current Ratio)
        # NOTE(review): a ratio of exactly 0 is falsy and lands in the
        # "Unknown" branch - confirm whether that is intended.
        current_ratio = metrics.current_ratio
        if not current_ratio:
            factors.append("Unknown: Missing Current Ratio data")
        elif current_ratio < 1.0:
            score -= 20
            factors.append("Critical: Current Ratio < 1.0 (Liquidity Issue)")
            liquidity = "Critical"
        elif current_ratio < 1.5:
            score -= 10
            factors.append("Warning: Current Ratio < 1.5")
            liquidity = "Medium"

        # 2. Solvency Risk (Debt to Equity)
        leverage = metrics.debt_to_equity
        if leverage:
            if leverage > 2.0:
                score -= 15
                factors.append("High Leverage: Debt/Equity > 2.0")
                solvency = "High Risk"
            elif leverage > 1.0:
                solvency = "Medium Risk"

        # 3. Profitability Risk
        net_margin = metrics.net_margin
        if net_margin and net_margin < 0:
            score -= 25
            factors.append("Loss Making: Negative Net Margin")

        # 4. Burn Rate (Runway)
        runway_months = None
        if monthly_burn > 0:
            runway_months = balance_cash / monthly_burn
            if runway_months < 3:
                score -= 25
                factors.append(f"CRITICAL: Low Cash Runway ({runway_months:.1f} months)")
                liquidity = "Critical"
            elif runway_months < 6:
                score -= 10
                factors.append(f"Warning: Cash Runway < 6 months ({runway_months:.1f} months)")

        return RiskAnalysis(
            risk_score=max(0.0, score),
            risk_factors=factors,
            liquidity_risk=liquidity,
            solvency_risk=solvency,
            burn_rate_months=runway_months
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/analysis/simulation.py DELETED
@@ -1,67 +0,0 @@
1
- from app.schemas.financial import FinancialReport, StandardizedDataPackage, KPIMetrics, RiskAnalysis, IncomeStatementStandard
2
- from app.services.analysis.kpi import KPIAnalyzer
3
- from app.services.analysis.risk import RiskAnalyzer
4
- from app.services.analysis.health_score import HealthScoreAnalyzer
5
- from app.services.analysis.fundamental import FundamentalAnalyzer
6
- import copy
7
-
8
class SimulationService:
    @staticmethod
    def run_simulation(
        original_data: FinancialReport,
        delta_revenue_percent: float = 0.0,
        delta_cogs_percent: float = 0.0,
        delta_payroll_percent: float = 0.0,
        delta_marketing_percent: float = 0.0,
        delta_fixed_costs_percent: float = 0.0
    ) -> StandardizedDataPackage:
        """
        Run a what-if scenario on a copy of the financial data.

        Each delta is a percentage as a float (e.g. 10.0 for +10%) applied to
        the matching income-statement line(s); the fixed-cost delta scales
        both rent and other operating expenses. The input report is not
        mutated. The adjusted report is re-analyzed with
        FundamentalAnalyzer.analyze and a one-line summary of the deltas is
        placed first in the insights.
        """
        # Work on a deep copy so the caller's report stays untouched
        simulated_report = copy.deepcopy(original_data)
        income = simulated_report.income_statement

        # (delta in percent, income-statement fields scaled by it)
        adjustments = (
            (delta_revenue_percent, ("revenue",)),
            (delta_cogs_percent, ("cogs",)),
            (delta_payroll_percent, ("payroll_expenses",)),
            (delta_marketing_percent, ("marketing_expenses",)),
            (delta_fixed_costs_percent, ("rent_expense", "other_operating_expenses")),
        )
        for delta, field_names in adjustments:
            if delta != 0:
                for field_name in field_names:
                    setattr(income, field_name, getattr(income, field_name) * (1 + delta / 100))

        # No dependent fields are recomputed here: the structure is assumed
        # static unless explicitly modified. The full analyzer is re-run so
        # the simulated data also gets runway, optimization insights, etc.
        full_analysis = FundamentalAnalyzer.analyze(simulated_report)

        # Lead the insights with a summary of what changed
        # NOTE(review): the payroll delta is applied above but not mentioned here.
        sim_summary = f"Simulation: Rev {delta_revenue_percent:+.0f}%, COGS {delta_cogs_percent:+.0f}%, Mkt {delta_marketing_percent:+.0f}%, Fixed {delta_fixed_costs_percent:+.0f}%"
        full_analysis['insights'].insert(0, sim_summary)

        section_names = (
            'kpis',
            'risk_analysis',
            'health_score',
            'insights',
            'recommendations',
            'runway_forecast',
            'optimization_insights',
        )
        sections = {name: full_analysis[name] for name in section_names}
        return StandardizedDataPackage(raw_data=simulated_report, **sections)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/feature_service.py DELETED
@@ -1,306 +0,0 @@
1
- """
2
- Feature Service - Business logic for feature flag management.
3
-
4
- Handles the resolution of feature availability considering:
5
- 1. Admin overrides (from database)
6
- 2. Plan defaults (from plan_config.py)
7
- 3. Feature registry validation
8
- """
9
-
10
- from typing import List, Dict, Optional, Any
11
- from sqlalchemy.orm import Session
12
- from datetime import datetime, timedelta
13
-
14
- from app.core.feature_registry import (
15
- get_all_features,
16
- get_feature_by_id,
17
- get_all_feature_ids,
18
- get_features_by_category,
19
- Feature
20
- )
21
- from app.core.plan_config import (
22
- get_default_features,
23
- get_upload_limit as get_default_upload_limit,
24
- get_all_plans,
25
- get_all_engines,
26
- PLAN_DEFAULTS
27
- )
28
- from app.models.feature_flags import PlanFeatureOverride, PlanUploadLimit
29
- from app.models.user import User
30
-
31
-
32
- def get_effective_features(db: Session, plan_name: str) -> List[str]:
33
- """
34
- Get the list of enabled feature IDs for a plan,
35
- considering admin overrides.
36
-
37
- Resolution order:
38
- 1. Start with plan defaults from plan_config.py
39
- 2. Apply any overrides from database
40
- """
41
- # Get default features for plan
42
- default_features = set(get_default_features(plan_name))
43
-
44
- # Get all overrides for this plan
45
- overrides = db.query(PlanFeatureOverride).filter(
46
- PlanFeatureOverride.plan_name == plan_name
47
- ).all()
48
-
49
- # Apply overrides
50
- for override in overrides:
51
- if override.enabled:
52
- default_features.add(override.feature_id)
53
- else:
54
- default_features.discard(override.feature_id)
55
-
56
- return list(default_features)
57
-
58
-
59
- def is_feature_enabled(db: Session, plan_name: str, feature_id: str) -> bool:
60
- """Check if a specific feature is enabled for a plan."""
61
- enabled_features = get_effective_features(db, plan_name)
62
- return feature_id in enabled_features
63
-
64
-
65
- def resolve_user_features(db: Session, user: User) -> List[str]:
66
- """
67
- Resolve final feature flags for a user, combining:
68
- 1. Plan Entitlements (Base)
69
- 2. User-Specific Overrides (Add-ons/Removals) -> stored in user.custom_features
70
- 3. Engine Constraints (Hard Limit)
71
-
72
- Returns: List of enabled feature IDs.
73
- """
74
- # 1. Base Plan Features
75
- current_plan = user.plan or "Free"
76
- if user.is_admin:
77
- current_plan = "Admin"
78
-
79
- plan_features = set(get_effective_features(db, current_plan))
80
-
81
- # 2. Apply User Custom Overrides (Add-ons / Removals)
82
- # user.custom_features is a JSON dict { "feature_id": bool }
83
- # Ensure it's a dict (SQLAlchemy JSON might return None if default not applied yet)
84
- custom_map = user.custom_features or {}
85
- if isinstance(custom_map, str):
86
- # Handle case with SQLite where it might be stored as string
87
- import json
88
- try:
89
- custom_map = json.loads(custom_map)
90
- except:
91
- custom_map = {}
92
-
93
- for fid, enabled in custom_map.items():
94
- if enabled:
95
- plan_features.add(fid)
96
- elif fid in plan_features:
97
- plan_features.remove(fid)
98
-
99
- # 3. Apply Engine Constraints (Hardware Limits)
100
- # Default to v1 if not set
101
- engine_pref = getattr(user, "preferred_engine", "v1") or "v1"
102
- engine_key = f"_ENGINE_{engine_pref}"
103
-
104
- # Get engine allowed features
105
- engine_features = set(get_effective_features(db, engine_key))
106
-
107
- # Final Result = (Plan U Custom) INTERSECT Engine
108
- return list(plan_features.intersection(engine_features))
109
-
110
-
111
-
112
- def get_effective_upload_limit(db: Session, plan_name: str) -> int:
113
- """
114
- Get the upload limit for a plan, considering admin overrides.
115
- """
116
- # Check for override
117
- override = db.query(PlanUploadLimit).filter(
118
- PlanUploadLimit.plan_name == plan_name
119
- ).first()
120
-
121
- if override:
122
- return override.upload_limit
123
-
124
- return get_default_upload_limit(plan_name)
125
-
126
-
127
- def get_all_plan_features(db: Session) -> Dict[str, Dict[str, Any]]:
128
- """
129
- Get feature configuration for all plans.
130
- Returns a dict with plan names as keys and feature configs as values.
131
- """
132
- all_feature_ids = get_all_feature_ids()
133
- result = {}
134
-
135
- for plan_name in get_all_plans():
136
- enabled_features = get_effective_features(db, plan_name)
137
- upload_limit = get_effective_upload_limit(db, plan_name)
138
-
139
- result[plan_name] = {
140
- "upload_limit": upload_limit,
141
- "features": {
142
- fid: fid in enabled_features
143
- for fid in all_feature_ids
144
- }
145
- }
146
-
147
- return result
148
-
149
-
150
- def get_feature_matrix(db: Session) -> Dict[str, Any]:
151
- """
152
- Get feature matrix for admin console display.
153
- Includes categories, features, and per-plan enablement.
154
- """
155
- categories = get_features_by_category()
156
- plans = get_all_plans()
157
- engines = get_all_engines()
158
-
159
- # Build matrix
160
- matrix = {}
161
- for cat_name, features in categories.items():
162
- matrix[cat_name] = []
163
- for feature in features:
164
- row = {
165
- "id": feature.id,
166
- "name": feature.name,
167
- "description": feature.description,
168
- "memory_cost_mb": getattr(feature, "memory_cost_mb", 0),
169
- "plans": {},
170
- "engines": {}
171
- }
172
- for plan in plans:
173
- row["plans"][plan] = is_feature_enabled(db, plan, feature.id)
174
- for engine in engines:
175
- row["engines"][engine] = is_feature_enabled(db, engine, feature.id)
176
- matrix[cat_name].append(row)
177
-
178
- return {
179
- "categories": list(categories.keys()),
180
- "plans": plans,
181
- "engines": engines,
182
- "matrix": matrix
183
- }
184
-
185
-
186
- def set_feature_override(
187
- db: Session,
188
- plan_name: str,
189
- feature_id: str,
190
- enabled: bool,
191
- admin_id: Optional[int] = None
192
- ) -> PlanFeatureOverride:
193
- """
194
- Set or update a feature override for a plan.
195
- """
196
- # Validate feature exists
197
- if not get_feature_by_id(feature_id):
198
- raise ValueError(f"Unknown feature ID: {feature_id}")
199
-
200
- # Find or create override
201
- override = db.query(PlanFeatureOverride).filter(
202
- PlanFeatureOverride.plan_name == plan_name,
203
- PlanFeatureOverride.feature_id == feature_id
204
- ).first()
205
-
206
- if override:
207
- override.enabled = enabled
208
- override.updated_by_id = admin_id
209
- else:
210
- override = PlanFeatureOverride(
211
- plan_name=plan_name,
212
- feature_id=feature_id,
213
- enabled=enabled,
214
- updated_by_id=admin_id
215
- )
216
- db.add(override)
217
-
218
- db.commit()
219
- db.refresh(override)
220
- return override
221
-
222
-
223
- def bulk_set_features(
224
- db: Session,
225
- plan_name: str,
226
- feature_states: Dict[str, bool],
227
- admin_id: Optional[int] = None
228
- ) -> int:
229
- """
230
- Bulk update feature states for a plan.
231
- Returns count of updated features.
232
- """
233
- count = 0
234
- for feature_id, enabled in feature_states.items():
235
- set_feature_override(db, plan_name, feature_id, enabled, admin_id)
236
- count += 1
237
- return count
238
-
239
-
240
- def reset_plan_to_defaults(db: Session, plan_name: str) -> int:
241
- """
242
- Remove all overrides for a plan, reverting to defaults.
243
- Returns count of deleted overrides.
244
- """
245
- result = db.query(PlanFeatureOverride).filter(
246
- PlanFeatureOverride.plan_name == plan_name
247
- ).delete()
248
- db.commit()
249
- return result
250
-
251
-
252
- def check_upload_limit(db: Session, user: User) -> Dict[str, Any]:
253
- """
254
- Check if user can upload, considering their plan limit.
255
- Also handles monthly reset.
256
-
257
- Returns:
258
- {
259
- "can_upload": bool,
260
- "uploads_used": int,
261
- "uploads_limit": int,
262
- "uploads_remaining": int,
263
- "reset_date": datetime
264
- }
265
- """
266
- # Check if we need to reset monthly count
267
- now = datetime.utcnow()
268
- if user.upload_reset_date:
269
- days_since_reset = (now - user.upload_reset_date).days
270
- if days_since_reset >= 30:
271
- user.monthly_upload_count = 0
272
- user.upload_reset_date = now
273
- db.commit()
274
- else:
275
- user.upload_reset_date = now
276
- db.commit()
277
-
278
- # Get effective limit
279
- plan = user.plan or "Individual"
280
- if user.is_admin:
281
- plan = "Admin"
282
-
283
- limit = get_effective_upload_limit(db, plan)
284
- used = user.monthly_upload_count or 0
285
- remaining = max(0, limit - used)
286
-
287
- # Calculate next reset
288
- next_reset = user.upload_reset_date + timedelta(days=30) if user.upload_reset_date else now + timedelta(days=30)
289
-
290
- return {
291
- "can_upload": used < limit,
292
- "uploads_used": used,
293
- "uploads_limit": limit,
294
- "uploads_remaining": remaining,
295
- "reset_date": next_reset.isoformat()
296
- }
297
-
298
-
299
- def increment_upload_count(db: Session, user: User) -> int:
300
- """
301
- Increment user's upload count. Call after successful upload.
302
- Returns new count.
303
- """
304
- user.monthly_upload_count = (user.monthly_upload_count or 0) + 1
305
- db.commit()
306
- return user.monthly_upload_count
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/ingestion/__init__.py DELETED
@@ -1,57 +0,0 @@
1
- """
2
- Ingestion Layer - File parsing and data extraction.
3
-
4
- This package handles parsing of various financial document formats
5
- and standardizing them into a common FinancialReport schema.
6
-
7
- ## Supported Formats
8
-
9
- | Format | Parser | Description |
10
- |--------|--------|-------------|
11
- | CSV | CSVParser | Comma-separated financial data |
12
- | PDF | HybridPDFParser | Dolphin-v2 + pdfplumber hybrid extraction |
13
- | PDF | PDFParser | Legacy pdfplumber-only parser |
14
- | XLSX/XLS | XLSXParser | Excel workbooks |
15
-
16
- ## PDF Hybrid Architecture
17
-
18
- PDF files are processed by both Dolphin-v2 and pdfplumber:
19
- 1. Dolphin: layout analysis, document classification, element extraction
20
- 2. pdfplumber: gap-filling table + regex extraction
21
- 3. Merge: Dolphin fields take priority, pdfplumber fills gaps
22
-
23
- If Dolphin is not installed, falls back to pdfplumber-only automatically.
24
-
25
- ## Usage
26
-
27
- Use UnifiedParser for automatic format detection:
28
-
29
- ```python
30
- from app.services.ingestion import UnifiedParser
31
-
32
- report = UnifiedParser.parse(file_path, original_filename)
33
- ```
34
-
35
- Or use specific parsers directly:
36
-
37
- ```python
38
- from app.services.ingestion import CSVParser, HybridPDFParser, XLSXParser
39
-
40
- report = CSVParser.parse(file_path)
41
- report = HybridPDFParser.parse(file_path) # Dolphin + pdfplumber
42
- report = XLSXParser.parse(file_path)
43
- ```
44
-
45
- ## Adding New Formats
46
-
47
- 1. Create `parser_xxx.py` with a class implementing `parse(file_path) -> FinancialReport`
48
- 2. Register in `unified_parser.py` SUPPORTED_EXTENSIONS dict
49
- 3. Add import in this `__init__.py`
50
- """
51
-
52
- from app.services.ingestion.unified_parser import UnifiedParser
53
- from app.services.ingestion.parser_csv import CSVParser
54
- from app.services.ingestion.parser_pdf import PDFParser
55
- from app.services.ingestion.parser_dolphin import HybridPDFParser
56
- from app.services.ingestion.parser_xlsx import XLSXParser
57
- from app.services.ingestion.mappings import DataMapper
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/ingestion/dolphin/__init__.py DELETED
@@ -1,158 +0,0 @@
1
- """
2
- Dolphin PDF Extraction Module — Hybrid Architecture.
3
-
4
- Uses ByteDance Dolphin-v2 for advanced document layout analysis,
5
- classification, and element extraction, combined with pdfplumber
6
- for gap-filling and validation.
7
-
8
- ## Quick Check
9
-
10
- ```python
11
- from app.services.ingestion.dolphin import is_dolphin_available, ensure_model_downloaded
12
-
13
- if is_dolphin_available():
14
- from app.services.ingestion.dolphin.client import DolphinClient
15
- client = DolphinClient()
16
- ```
17
- """
18
-
19
- import os
20
- import logging
21
- from typing import Optional
22
-
23
- logger = logging.getLogger(__name__)
24
-
25
- # Default model storage location (relative to backend root)
26
- DEFAULT_MODEL_DIR = os.path.join(
27
- os.path.dirname(os.path.abspath(__file__)),
28
- "..", "..", "..", "..", "models", "dolphin-v2"
29
- )
30
-
31
- _dolphin_available: Optional[bool] = None
32
-
33
-
34
- def _detect_device() -> str:
35
- """Auto-detect best available compute device: cuda > mps > cpu."""
36
- try:
37
- import torch
38
- if torch.cuda.is_available():
39
- logger.info("Dolphin device: CUDA GPU detected")
40
- return "cuda"
41
- elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
42
- logger.info("Dolphin device: Apple MPS (Metal) detected")
43
- return "mps"
44
- except ImportError:
45
- pass
46
- logger.info("Dolphin device: CPU mode")
47
- return "cpu"
48
-
49
-
50
- def _get_model_path() -> str:
51
- """Resolve model path from config or default."""
52
- try:
53
- from app.core.config import settings
54
- if settings.DOLPHIN_MODEL_PATH:
55
- return settings.DOLPHIN_MODEL_PATH
56
- except Exception:
57
- pass
58
- return os.path.abspath(DEFAULT_MODEL_DIR)
59
-
60
-
61
- def is_dolphin_available() -> bool:
62
- """
63
- Check if Dolphin model and dependencies are installed.
64
- Result is cached after first check.
65
- """
66
- global _dolphin_available
67
- if _dolphin_available is not None:
68
- return _dolphin_available
69
-
70
- # If remote API is configured, we consider Dolphin available
71
- # (The remote worker manages the model)
72
- from app.core.config import settings
73
- if settings.DOLPHIN_API_URL:
74
- _dolphin_available = True
75
- return True
76
-
77
- try:
78
- import torch # noqa: F401
79
- import transformers # noqa: F401
80
- from PIL import Image # noqa: F401
81
-
82
- model_path = _get_model_path()
83
- if os.path.isdir(model_path):
84
- # Check for key model files
85
- has_config = os.path.exists(os.path.join(model_path, "config.json"))
86
- has_weights = (
87
- os.path.exists(os.path.join(model_path, "model.safetensors"))
88
- or os.path.exists(os.path.join(model_path, "pytorch_model.bin"))
89
- or any(f.startswith("model-") for f in os.listdir(model_path) if f.endswith(".safetensors"))
90
- )
91
- _dolphin_available = has_config and has_weights
92
- else:
93
- _dolphin_available = False
94
-
95
- except ImportError as e:
96
- logger.debug(f"Dolphin dependencies not installed: {e}")
97
- _dolphin_available = False
98
-
99
- logger.info(f"Dolphin availability: {_dolphin_available}")
100
- return _dolphin_available
101
-
102
-
103
- def ensure_model_downloaded(force: bool = False) -> str:
104
- """
105
- Download Dolphin-v2 model from HuggingFace if not already present.
106
-
107
- Args:
108
- force: If True, re-download even if model exists
109
-
110
- Returns:
111
- Path to the downloaded model directory
112
- """
113
- model_path = _get_model_path()
114
-
115
- if not force and os.path.isdir(model_path):
116
- config_path = os.path.join(model_path, "config.json")
117
- if os.path.exists(config_path):
118
- logger.info(f"Dolphin model already present at {model_path}")
119
- return model_path
120
-
121
- logger.info("Downloading Dolphin-v2 model from HuggingFace...")
122
-
123
- try:
124
- from huggingface_hub import snapshot_download
125
-
126
- os.makedirs(model_path, exist_ok=True)
127
- snapshot_download(
128
- repo_id="ByteDance/Dolphin-v2",
129
- local_dir=model_path,
130
- local_dir_use_symlinks=False,
131
- )
132
- logger.info(f"Dolphin-v2 model downloaded to {model_path}")
133
-
134
- # Invalidate cache so next check picks up the new model
135
- global _dolphin_available
136
- _dolphin_available = None
137
-
138
- return model_path
139
-
140
- except Exception as e:
141
- logger.error(f"Failed to download Dolphin model: {e}")
142
- raise RuntimeError(
143
- f"Dolphin model download failed: {e}. "
144
- "Install huggingface-hub and ensure network access, "
145
- "or manually download to: {model_path}"
146
- ) from e
147
-
148
-
149
- def get_device() -> str:
150
- """Get configured or auto-detected device."""
151
- try:
152
- from app.core.config import settings
153
- device = getattr(settings, "DOLPHIN_DEVICE", "auto")
154
- if device != "auto":
155
- return device
156
- except Exception:
157
- pass
158
- return _detect_device()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/ingestion/dolphin/classifier.py DELETED
@@ -1,288 +0,0 @@
1
- """
2
- Document Classifier — Identifies financial document types from parsed content.
3
-
4
- Uses Dolphin's structured output (headings, sections, tables) to classify
5
- PDFs into specific financial document categories with confidence scoring.
6
- """
7
-
8
- import re
9
- import logging
10
- from typing import List, Dict, Tuple, Optional
11
- from dataclasses import dataclass, field
12
-
13
- logger = logging.getLogger(__name__)
14
-
15
-
16
- @dataclass
17
- class DocumentClassification:
18
- """Classification result for a parsed document."""
19
- doc_type: str # Primary document type
20
- confidence: float # 0.0 - 1.0
21
- detected_sections: List[str] = field(default_factory=list)
22
- extraction_method: str = "dolphin_hybrid"
23
- secondary_types: List[str] = field(default_factory=list) # Additional statements found
24
-
25
-
26
- # ---------------------------------------------------------------------------
27
- # Document type signature definitions
28
- # ---------------------------------------------------------------------------
29
-
30
- # Each type maps to: (required_keywords, optional_keywords, min_required_matches)
31
- DOCUMENT_SIGNATURES: Dict[str, Dict] = {
32
- "10-K": {
33
- "keywords": [
34
- "form 10-k", "annual report", "securities and exchange commission",
35
- "fiscal year ended", "10-k", "annual report pursuant",
36
- ],
37
- "sections": [
38
- "consolidated statements of operations",
39
- "consolidated balance sheets",
40
- "consolidated statements of cash flows",
41
- "management's discussion and analysis",
42
- "risk factors",
43
- ],
44
- "min_keyword_matches": 2,
45
- "min_section_matches": 2,
46
- },
47
- "10-Q": {
48
- "keywords": [
49
- "form 10-q", "quarterly report", "securities and exchange commission",
50
- "fiscal quarter", "10-q", "quarterly report pursuant",
51
- ],
52
- "sections": [
53
- "condensed consolidated statements",
54
- "condensed consolidated balance",
55
- "management's discussion",
56
- ],
57
- "min_keyword_matches": 2,
58
- "min_section_matches": 1,
59
- },
60
- "income_statement": {
61
- "keywords": [
62
- "income statement", "statement of operations", "statement of earnings",
63
- "profit and loss", "p&l", "statement of income",
64
- "consolidated statements of operations",
65
- "consolidated statements of income",
66
- ],
67
- "sections": [
68
- "revenue", "net income", "operating income", "gross profit",
69
- "cost of goods sold", "operating expenses",
70
- ],
71
- "min_keyword_matches": 1,
72
- "min_section_matches": 2,
73
- },
74
- "balance_sheet": {
75
- "keywords": [
76
- "balance sheet", "statement of financial position",
77
- "consolidated balance sheets",
78
- ],
79
- "sections": [
80
- "total assets", "total liabilities", "stockholders' equity",
81
- "current assets", "current liabilities", "cash and equivalents",
82
- ],
83
- "min_keyword_matches": 1,
84
- "min_section_matches": 2,
85
- },
86
- "cash_flow_statement": {
87
- "keywords": [
88
- "cash flow", "statement of cash flows",
89
- "consolidated statements of cash flows",
90
- ],
91
- "sections": [
92
- "operating activities", "investing activities",
93
- "financing activities", "net change in cash",
94
- ],
95
- "min_keyword_matches": 1,
96
- "min_section_matches": 2,
97
- },
98
- "bank_statement": {
99
- "keywords": [
100
- "bank statement", "account statement", "transaction history",
101
- "account summary", "statement period", "beginning balance",
102
- "ending balance",
103
- ],
104
- "sections": [
105
- "deposits", "withdrawals", "balance", "transaction date",
106
- ],
107
- "min_keyword_matches": 2,
108
- "min_section_matches": 1,
109
- },
110
- "invoice": {
111
- "keywords": [
112
- "invoice", "bill to", "ship to", "due date", "invoice number",
113
- "purchase order", "payment terms", "amount due",
114
- ],
115
- "sections": [
116
- "subtotal", "tax", "total", "description", "quantity", "unit price",
117
- ],
118
- "min_keyword_matches": 2,
119
- "min_section_matches": 2,
120
- },
121
- "tax_return": {
122
- "keywords": [
123
- "tax return", "form 1040", "form 1120", "form 990",
124
- "internal revenue service", "irs", "taxable income",
125
- "adjusted gross income", "tax liability",
126
- ],
127
- "sections": [
128
- "income", "deductions", "credits", "tax due", "refund",
129
- ],
130
- "min_keyword_matches": 2,
131
- "min_section_matches": 1,
132
- },
133
- }
134
-
135
-
136
- class DocumentClassifier:
137
- """
138
- Classifies financial documents from parsed content.
139
-
140
- Uses a weighted keyword + section matching strategy against
141
- known document type signatures.
142
-
143
- Usage:
144
- classifier = DocumentClassifier()
145
- result = classifier.classify(full_text, dolphin_sections)
146
- """
147
-
148
- @staticmethod
149
- def classify(
150
- text_content: str,
151
- dolphin_sections: Optional[List[Dict]] = None,
152
- dolphin_elements: Optional[list] = None,
153
- ) -> DocumentClassification:
154
- """
155
- Classify the document based on text content and structural elements.
156
-
157
- Args:
158
- text_content: Full extracted text from the document
159
- dolphin_sections: Layout sections from Dolphin (if available)
160
- dolphin_elements: Parsed elements from Dolphin (if available)
161
-
162
- Returns:
163
- DocumentClassification with type, confidence, and detected sections
164
- """
165
- if not text_content:
166
- return DocumentClassification(
167
- doc_type="general_financial",
168
- confidence=0.0,
169
- extraction_method="dolphin_hybrid",
170
- )
171
-
172
- text_lower = text_content.lower()
173
- scores: Dict[str, float] = {}
174
- section_matches: Dict[str, List[str]] = {}
175
-
176
- for doc_type, signature in DOCUMENT_SIGNATURES.items():
177
- score, matched_sections = DocumentClassifier._score_document(
178
- text_lower, signature, dolphin_sections
179
- )
180
- scores[doc_type] = score
181
- section_matches[doc_type] = matched_sections
182
-
183
- # Find best match
184
- if not scores or max(scores.values()) == 0:
185
- return DocumentClassification(
186
- doc_type="general_financial",
187
- confidence=0.1,
188
- extraction_method="dolphin_hybrid",
189
- )
190
-
191
- best_type = max(scores, key=scores.get) # type: ignore[arg-type]
192
- best_score = scores[best_type]
193
-
194
- # Normalize confidence to 0-1 range (max theoretical ~1.0)
195
- confidence = min(best_score / 10.0, 1.0)
196
-
197
- # Find secondary types (other statements detected within the doc)
198
- secondary = [
199
- t for t, s in scores.items()
200
- if s > 2.0 and t != best_type
201
- ]
202
-
203
- return DocumentClassification(
204
- doc_type=best_type,
205
- confidence=round(confidence, 3),
206
- detected_sections=section_matches.get(best_type, []),
207
- extraction_method="dolphin_hybrid",
208
- secondary_types=secondary,
209
- )
210
-
211
- @staticmethod
212
- def _score_document(
213
- text_lower: str,
214
- signature: Dict,
215
- dolphin_sections: Optional[List[Dict]] = None,
216
- ) -> Tuple[float, List[str]]:
217
- """
218
- Score a document against a type signature.
219
-
220
- Returns (score, list_of_matched_sections).
221
- """
222
- keyword_hits = 0
223
- for kw in signature["keywords"]:
224
- if kw in text_lower:
225
- keyword_hits += 1
226
-
227
- section_hits = 0
228
- matched_sections = []
229
- for sec in signature["sections"]:
230
- if sec in text_lower:
231
- section_hits += 1
232
- matched_sections.append(sec)
233
-
234
- # Bonus from Dolphin structural analysis
235
- dolphin_bonus = 0.0
236
- if dolphin_sections:
237
- section_labels = [
238
- s.get("type", "").lower() for s in dolphin_sections
239
- ]
240
- for sec in signature["sections"]:
241
- if any(sec in label for label in section_labels):
242
- dolphin_bonus += 0.5
243
-
244
- # Check minimum thresholds
245
- min_kw = signature["min_keyword_matches"]
246
- min_sec = signature["min_section_matches"]
247
-
248
- if keyword_hits < min_kw and section_hits < min_sec:
249
- return 0.0, matched_sections
250
-
251
- # Weighted score: keywords × 2 + sections × 1.5 + dolphin bonus
252
- score = (keyword_hits * 2.0) + (section_hits * 1.5) + dolphin_bonus
253
-
254
- return score, matched_sections
255
-
256
- @staticmethod
257
- def get_financial_statement_types(classification: DocumentClassification) -> List[str]:
258
- """
259
- Return the list of financial statement types that should be
260
- extracted from this document.
261
-
262
- For a 10-K/10-Q, extract all three statements.
263
- For a standalone statement, extract just that one.
264
- """
265
- comprehensive_types = {"10-K", "10-Q", "general_financial"}
266
-
267
- if classification.doc_type in comprehensive_types:
268
- return ["income", "balance", "cash_flow"]
269
-
270
- type_map = {
271
- "income_statement": ["income"],
272
- "balance_sheet": ["balance"],
273
- "cash_flow_statement": ["cash_flow"],
274
- "bank_statement": ["cash_flow"],
275
- "invoice": ["income"],
276
- "tax_return": ["income"],
277
- }
278
-
279
- base = type_map.get(classification.doc_type, ["income", "balance", "cash_flow"])
280
-
281
- # Add any secondary types detected
282
- for sec_type in classification.secondary_types:
283
- extra = type_map.get(sec_type, [])
284
- for e in extra:
285
- if e not in base:
286
- base.append(e)
287
-
288
- return base
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/ingestion/dolphin/extractor.py DELETED
@@ -1,336 +0,0 @@
1
- """
2
- Dolphin Extractor — Extracts structured financial data from Dolphin's parsed output.
3
-
4
- Converts Dolphin's Markdown/JSON tables and text elements into
5
- key-value financial data using the existing DataMapper.
6
- """
7
-
8
- import re
9
- import logging
10
- from typing import Dict, List, Any, Optional
11
-
12
- logger = logging.getLogger(__name__)
13
-
14
-
15
class DolphinExtractor:
    """
    Extracts financial data from Dolphin's parsed output.

    Works with a document result exposing ``pages`` (each with ``elements``
    and ``markdown``) and ``full_markdown`` to produce a flat dict of
    {field_name: value} pairs ready for FinancialReport construction.

    Usage:
        extractor = DolphinExtractor()
        data = extractor.extract(dolphin_result, doc_classification)
    """

    @staticmethod
    def extract(
        dolphin_result,  # DolphinDocumentResult
        doc_classification=None,  # DocumentClassification
    ) -> Dict[str, Any]:
        """
        Extract all financial data from a Dolphin document result.

        Three strategies run in order (tables, text elements, full markdown);
        a field found by an earlier strategy is never overwritten by a later
        one.

        Args:
            dolphin_result: DolphinDocumentResult from client.parse_document()
            doc_classification: Optional classification; accepted for
                interface compatibility but not used by this method.

        Returns:
            Dict of {standardized_field_name: float_value}
        """
        # Imported lazily, as in the original module, to avoid a hard
        # import-time dependency on the mappings module.
        from app.services.ingestion.mappings import DataMapper

        extracted: Dict[str, Any] = {}
        tables_data = []
        text_content_parts = []

        for page in dolphin_result.pages:
            for element in page.elements:
                if element.element_type == "table":
                    tables_data.append(
                        DolphinExtractor._parse_markdown_table(element.content)
                    )
                elif element.element_type == "text":
                    text_content_parts.append(element.content)

        # --- Strategy 1: Table Extraction ---
        for table_rows in tables_data:
            table_data = DolphinExtractor._extract_from_table_rows(
                table_rows, DataMapper
            )
            for k, v in table_data.items():
                # First value seen for a field wins.
                extracted.setdefault(k, v)

        # --- Strategy 2: Text/Regex Extraction from Dolphin output ---
        full_text = "\n".join(text_content_parts)
        if full_text:
            text_data = DolphinExtractor._extract_from_text(full_text, DataMapper)
            for k, v in text_data.items():
                extracted.setdefault(k, v)

        # --- Strategy 3: Full Markdown extraction (catch-all) ---
        if dolphin_result.full_markdown:
            markdown_data = DolphinExtractor._extract_from_text(
                dolphin_result.full_markdown, DataMapper
            )
            for k, v in markdown_data.items():
                extracted.setdefault(k, v)

        logger.info(
            f"Dolphin extracted {len(extracted)} fields from "
            f"{len(tables_data)} tables and {len(text_content_parts)} text blocks"
        )

        return extracted

    @staticmethod
    def extract_company_name(dolphin_result) -> Optional[str]:
        """
        Attempt to extract company name from Dolphin's parsed output.

        Inspects the markdown of the first two pages and, per page, tries in
        order: the nearest non-boilerplate line before a "registrant" marker,
        the first markdown heading, then the first non-trivial line among the
        first 30.  Returns at most 100 characters, or None if nothing fits.
        """
        if not dolphin_result.pages:
            return None

        # Check first page(s) for company name patterns
        for page in dolphin_result.pages[:2]:
            markdown = page.markdown
            if not markdown:
                continue

            # SEC Filing: "Exact name of registrant as specified in its charter"
            registrant_match = re.search(
                r"(?:exact\s+name\s+of\s+registrant|registrant)",
                markdown,
                re.IGNORECASE,
            )
            if registrant_match:
                # Walk backwards through the last 10 lines before the marker.
                lines = markdown[: registrant_match.start()].strip().split("\n")
                for line in reversed(lines[-10:]):
                    candidate = line.strip().strip("#").strip("*").strip()
                    if (
                        len(candidate) > 2
                        and not _is_boilerplate(candidate)
                        and any(c.isalpha() for c in candidate)
                    ):
                        return candidate[:100]

            # Markdown heading on this page
            heading_match = re.search(r"^#+\s+(.+)$", markdown, re.MULTILINE)
            if heading_match:
                candidate = heading_match.group(1).strip()
                if len(candidate) > 2 and not _is_boilerplate(candidate):
                    return candidate[:100]

            # First non-trivial line
            for line in markdown.split("\n")[:30]:
                candidate = line.strip().strip("#").strip("*").strip()
                if (
                    len(candidate) > 3
                    and not _is_boilerplate(candidate)
                    and any(c.isalpha() for c in candidate)
                ):
                    return candidate[:100]

        return None

    @staticmethod
    def extract_fiscal_year(dolphin_result) -> Optional[str]:
        """
        Extract fiscal year/period from Dolphin output.

        Searches the first 5000 characters of ``full_markdown`` with a list of
        patterns (tried in order, case-insensitively) and returns the first
        captured date or year string, or None when nothing matches.
        """
        if not dolphin_result.full_markdown:
            return None

        patterns = [
            r"(?:YEAR|PERIOD|FISCAL\s+YEAR)\s+ENDED\s+([A-Z]+\s+\d{1,2},\s+\d{4})",
            r"(?:for\s+the\s+year\s+ended)\s+([A-Z]+\s+\d{1,2},\s+\d{4})",
            r"DECEMBER\s+31,\s+(\d{4})",
            r"(\d{4})\s+(?:annual|fiscal)",
        ]

        text = dolphin_result.full_markdown[:5000]
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1)

        return None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _parse_markdown_table(table_text: str) -> List[List[str]]:
        """
        Parse a Markdown-format table into a list of rows.

        Handles:
            | Header1 | Header2 |
            |---------|---------|
            | val1    | val2    |

        Lines not starting with "|" and separator rows are skipped.  Empty
        cells inside a row (including an empty leading label cell) are kept
        as "" so that column indexes stay aligned between header and data
        rows; only the artefacts of the leading/trailing pipes are removed.
        """
        rows = []
        for line in table_text.strip().split("\n"):
            line = line.strip()
            if not line.startswith("|"):
                continue
            # Skip separator rows (|---|---|)
            if all(re.match(r"^[\s\-:]+$", c) for c in line.split("|") if c.strip()):
                continue

            cells = [cell.strip() for cell in line.split("|")]
            # The leading pipe always yields one empty first element; the
            # trailing pipe (when present) yields one empty last element.
            cells = cells[1:]
            if line.endswith("|"):
                cells = cells[:-1]
            # Keep the row only if at least one cell has content.
            if any(cells):
                rows.append(cells)

        return rows

    @staticmethod
    def _extract_from_table_rows(
        rows: List[List[str]], data_mapper
    ) -> Dict[str, float]:
        """
        Extract financial data from parsed table rows using DataMapper.

        Assumes first column is label, remaining columns are values.
        Picks the most recent year column if years are detected in headers,
        falling back to the first parseable value cell in the row.  Values are
        scaled when the first three rows mention "in millions"/"in thousands".
        """
        if not rows:
            return {}

        data = {}

        # Detect target value column (most recent year)
        target_col = _find_target_column(rows)

        # Detect scale multiplier from header text
        multiplier = 1.0
        header_text = " ".join(" ".join(r) for r in rows[:3]).lower()
        if re.search(r"in millions|amounts in millions", header_text):
            multiplier = 1_000_000.0
        elif re.search(r"in thousands|amounts in thousands|\(in 000s\)", header_text):
            multiplier = 1_000.0

        for row in rows:
            if len(row) < 2:
                continue

            label = row[0]
            mapped_field = data_mapper.map_row(label)
            if not mapped_field:
                continue

            # Get value from target column or first numeric column
            val = None
            if target_col is not None and target_col < len(row):
                val = _clean_financial_value(row[target_col])

            if val is None:
                for cell in row[1:]:
                    val = _clean_financial_value(cell)
                    if val is not None:
                        break

            if val is not None:
                data[mapped_field] = val * multiplier

        return data

    @staticmethod
    def _extract_from_text(
        text: str, data_mapper
    ) -> Dict[str, float]:
        """
        Regex-based extraction from unstructured text.

        Catches line items in formats like:
            Revenue ............... $1,234,567
            Net Income              (456,789)

        For each field in ``data_mapper.FIELD_MAPPING`` the aliases are tried
        in order and the first one yielding a parseable number wins.  Aliases
        must appear as whole words (not inside a longer alphanumeric token),
        so short aliases such as "ar" do not match inside "year".
        """
        data = {}

        for field, aliases in data_mapper.FIELD_MAPPING.items():
            for alias in aliases:
                pattern = re.compile(
                    # Whole-word alias, then the nearest number; the number
                    # must start with a digit so a comma directly after the
                    # alias ("Revenues, net") is not captured as the value.
                    rf"(?<![A-Za-z0-9]){re.escape(alias)}(?![A-Za-z0-9])"
                    rf"[^0-9\-]*?(\(?\d[\d,]*\.?\d*\)?)",
                    re.IGNORECASE,
                )
                match = pattern.search(text)
                if match:
                    val = _clean_financial_value(match.group(1))
                    if val is not None:
                        data[field] = val
                        break

        return data
279
-
280
-
281
- # ---------------------------------------------------------------------------
282
- # Module-level utility functions
283
- # ---------------------------------------------------------------------------
284
-
285
- def _find_target_column(rows: List[List[str]]) -> Optional[int]:
286
- """Find the column index containing the most recent year."""
287
- max_year = 0
288
- target_col = None
289
-
290
- for row in rows[:5]: # Check headers
291
- for idx, cell in enumerate(row):
292
- cell_clean = cell.replace("$", "").strip()
293
- if re.match(r"^\d{4}$", cell_clean):
294
- year = int(cell_clean)
295
- if 2000 < year < 2100 and year > max_year:
296
- max_year = year
297
- target_col = idx
298
-
299
- return target_col
300
-
301
-
302
- def _clean_financial_value(val_str: Optional[str]) -> Optional[float]:
303
- """Convert financial string formats to float."""
304
- if not val_str:
305
- return None
306
-
307
- s = val_str.strip().replace("$", "").replace(",", "").replace(" ", "")
308
- if not s:
309
- return None
310
-
311
- # Handle parentheses as negative: (123) → -123
312
- if "(" in s and ")" in s:
313
- s = s.replace("(", "-").replace(")", "")
314
-
315
- # Handle em-dash or dash as zero
316
- if s in ("-", "—", "–"):
317
- return 0.0
318
-
319
- try:
320
- return float(s)
321
- except ValueError:
322
- return None
323
-
324
-
325
- _BOILERPLATE_PHRASES = {
326
- "table of contents", "contents", "index", "financial statements",
327
- "consolidated financial statements", "annual report", "quarterly report",
328
- "10-k", "10-q", "form 10-k", "form 10-q", "united states",
329
- "securities and exchange commission", "washington", "d.c.",
330
- "commission file number", "transition report",
331
- }
332
-
333
-
334
- def _is_boilerplate(text: str) -> bool:
335
- """Check if text is a common boilerplate heading."""
336
- return text.strip().lower() in _BOILERPLATE_PHRASES or text.strip().isdigit()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/ingestion/dolphin/remote_client.py DELETED
@@ -1,110 +0,0 @@
1
- """
2
- Remote Dolphin Client — Consumes the Dolphin-as-a-Service API.
3
-
4
- Sends PDF files to the external AI Worker (Hugging Face Space)
5
- and receives structured extraction results.
6
- """
7
-
8
- import os
9
- import httpx
10
- import logging
11
- from typing import Optional, Dict, Any, List
12
- from dataclasses import asdict
13
-
14
- from app.core.config import settings
15
- from app.services.ingestion.dolphin.client import (
16
- DolphinDocumentResult,
17
- DolphinPageResult,
18
- DolphinLayoutResult,
19
- DolphinElement,
20
- )
21
-
22
- logger = logging.getLogger(__name__)
23
-
24
-
25
class RemoteDolphinClient:
    """
    Client for the remote Dolphin AI worker service.

    Usage:
        client = RemoteDolphinClient(api_url="https://hf.space/...", api_key="...")
        result = client.parse_document("report.pdf")
    """

    def __init__(
        self,
        api_url: Optional[str] = None,
        api_key: Optional[str] = None,
        timeout: int = 300,  # 5 minutes for large PDFs
    ):
        """
        Configure the client.

        Args:
            api_url: Base URL of the worker; falls back to
                ``settings.DOLPHIN_API_URL``.  A trailing "/" is removed.
            api_key: Bearer token; falls back to ``settings.DOLPHIN_API_KEY``.
            timeout: Timeout passed to ``httpx.Client``.

        Raises:
            ValueError: if no API URL is available from either source.
        """
        # `or ""` guards the case where neither source provides a URL, so the
        # ValueError below is raised instead of an AttributeError on None.
        self.api_url = (api_url or settings.DOLPHIN_API_URL or "").rstrip("/")
        self.api_key = api_key or settings.DOLPHIN_API_KEY
        self.timeout = timeout

        if not self.api_url:
            raise ValueError("DOLPHIN_API_URL must be set for RemoteDolphinClient")

        logger.info(f"Initialized RemoteDolphinClient pointing to {self.api_url}")

    def parse_document(self, pdf_path: str) -> DolphinDocumentResult:
        """
        Send PDF to remote worker and reconstruct the result object.

        Posts the file as multipart form data to ``{api_url}/process``.

        Returns:
            The reconstructed DolphinDocumentResult, or an empty result
            (``total_pages=0``) when *pdf_path* does not exist.

        Raises:
            RuntimeError: when the worker answers with an HTTP error status.
            Exception: any other failure is logged and re-raised unchanged.
        """
        if not os.path.exists(pdf_path):
            logger.error(f"PDF not found: {pdf_path}")
            return DolphinDocumentResult(total_pages=0)

        url = f"{self.api_url}/process"
        headers = {}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        try:
            logger.info(f"Sending {pdf_path} to remote Dolphin worker...")

            # The file handle must stay open for the duration of the upload.
            with open(pdf_path, "rb") as f:
                files = {"file": (os.path.basename(pdf_path), f, "application/pdf")}

                with httpx.Client(timeout=self.timeout) as client:
                    response = client.post(url, files=files, headers=headers)
                    response.raise_for_status()

            data = response.json()
            return self._reconstruct_result(data)

        except httpx.HTTPStatusError as e:
            logger.error(f"Remote Dolphin API error: {e.response.text}")
            raise RuntimeError(f"Dolphin API failed: {e.response.status_code}") from e
        except Exception as e:
            logger.error(f"Remote Dolphin client failed: {e}")
            raise

    def _reconstruct_result(self, data: Dict[str, Any]) -> DolphinDocumentResult:
        """
        Convert JSON response back to DolphinDocumentResult objects.

        Each page entry must carry "page_number" and "markdown"; every other
        key is optional and defaults to an empty value.
        """
        pages = []
        for page_data in data.get("pages", []):
            elements = [
                DolphinElement(**element_data)
                for element_data in page_data.get("elements", [])
            ]
            pages.append(DolphinPageResult(
                page_number=page_data["page_number"],
                markdown=page_data["markdown"],
                structured_json=page_data.get("structured_json", {}),
                elements=elements,
            ))

        layouts = []
        for layout_data in data.get("layouts", []):
            layouts.append(DolphinLayoutResult(
                page_number=layout_data["page_number"],
                sections=layout_data.get("sections", []),
                reading_order=layout_data.get("reading_order", []),
                doc_type_hint=layout_data.get("doc_type_hint", "unknown"),
            ))

        return DolphinDocumentResult(
            pages=pages,
            layouts=layouts,
            full_markdown=data.get("full_markdown", ""),
            total_pages=data.get("total_pages", 0),
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/ingestion/mappings.py DELETED
@@ -1,315 +0,0 @@
1
- """
2
- Data Mapper - Field name normalization for financial data.
3
-
4
- Maps various field names from different file formats (CSV, PDF, XLSX)
5
- to standardized internal field names.
6
- """
7
-
8
- from typing import Dict, List, Optional, Tuple
9
-
10
-
11
class DataMapper:
    """
    Maps raw field names to standardized internal field names.

    Aliases are matched as whole words/phrases inside the (lower-cased)
    label, so short aliases such as "ar" or "rent" do not fire inside
    unrelated words like "year" or "current".

    Usage:
        field = DataMapper.map_row("Total Revenue")  # Returns "revenue"
        field = DataMapper.map_row("Accounts Receivable")  # Returns "accounts_receivable"
    """

    FIELD_MAPPING: Dict[str, List[str]] = {
        # =================================================================
        # INCOME STATEMENT
        # =================================================================
        "revenue": [
            "revenue", "sales", "gross sales", "total revenue", "net sales",
            "total net sales", "net revenue", "total sales", "service revenue",
            "product revenue", "subscription revenue", "recurring revenue",
            "operating revenue", "revenues, net", "revenues"
        ],
        "cogs": [
            "cogs", "cost of goods", "direct costs", "cost of sales",
            "cost of revenue", "cost of goods sold", "cost of products sold",
            "cost of services", "direct cost", "cost of merchandise"
        ],
        "marketing_expenses": [
            "marketing", "ad spend", "advertising", "marketing expense",
            "promotion", "marketing and advertising", "advertising expense",
            "marketing costs", "promotional expense", "customer acquisition"
        ],
        "payroll_expenses": [
            "payroll", "salaries", "wages", "employee costs", "personnel",
            "labor", "compensation", "salaries and wages", "employee benefits",
            "stock compensation", "share-based compensation", "labor cost",
            "wages and salaries", "staff costs"
        ],
        "rent_expense": [
            "rent", "lease", "occupancy", "facilities", "rent expense",
            "lease expense", "occupancy costs", "facility costs"
        ],
        "other_operating_expenses": [
            "other expense", "operating expense", "sga", "general and administrative",
            "g&a", "selling, general", "pre-opening", "impairment",
            "administrative expense", "operating expenses", "other operating",
            "research and development", "r&d", "utilities", "insurance"
        ],
        "depreciation": [
            "depreciation", "depreciation expense", "depreciation and amortization"
        ],
        "amortization": [
            "amortization", "amortization expense"
        ],
        "interest_expense": [
            "interest", "interest expense", "finance costs", "interest cost",
            "interest and finance charges", "borrowing costs"
        ],
        "taxes": [
            "tax", "income tax", "taxes", "provision for taxes", "income tax expense",
            "tax expense", "provision for income taxes"
        ],

        # =================================================================
        # BALANCE SHEET - ASSETS
        # =================================================================
        "cash": [
            "cash", "bank", "cash and equivalents", "cash & equivalents",
            "cash and cash equivalents", "cash on hand", "short-term investments",
            "cash, cash equivalents"
        ],
        "accounts_receivable": [
            "accounts receivable", "ar", "receivables", "trade receivables",
            "net receivables", "receivables, net", "trade accounts receivable"
        ],
        "inventory": [
            "inventory", "stock", "merchandise", "inventories",
            "merchandise inventory", "raw materials"
        ],
        "prepaid_expenses": [
            "prepaid", "prepaid expenses", "other current assets",
            "prepaid and other", "prepaids"
        ],
        "property_plant_equipment": [
            "ppe", "fixed assets", "property plant equipment", "equipment",
            "property, plant and equipment", "property and equipment",
            "net property", "fixed assets, net", "capital assets"
        ],
        "accumulated_depreciation": [
            "accumulated depreciation", "acc depreciation", "less depreciation"
        ],
        "intangible_assets": [
            "intangible assets", "goodwill", "soft assets", "intangibles",
            "goodwill and intangibles"
        ],

        # =================================================================
        # BALANCE SHEET - LIABILITIES
        # =================================================================
        "accounts_payable": [
            "accounts payable", "ap", "payables", "trade payables",
            "trade accounts payable"
        ],
        "accrued_liabilities": [
            "accrued liabilities", "accrued expenses", "accruals",
            "accrued and other"
        ],
        "short_term_debt": [
            "short term debt", "current portion of debt", "notes payable",
            "current debt", "short-term borrowings", "current portion of long-term debt"
        ],
        "long_term_debt": [
            "long term debt", "term loan", "non-current liabilities",
            "long-term borrowings", "bonds payable", "notes payable long-term"
        ],
        "deferred_revenue": [
            "deferred revenue", "unearned revenue", "contract liabilities",
            "deferred income"
        ],
        "total_equity": [
            "equity", "retained earnings", "shareholders equity", "total equity",
            "stockholders equity", "shareholders' equity", "stockholders' equity",
            "total shareholders equity", "net worth", "owner equity"
        ],

        # =================================================================
        # CASH FLOW STATEMENT
        # =================================================================
        "operating_cash_flow": [
            "operating cash flow", "cfo", "cash from operations",
            "cash flow from operating activities", "net cash from operating",
            "cash generated by operating activities", "operating activities",
            "net cash provided by operating", "cash flows from operating"
        ],
        "capex": [
            "capex", "capital expenditure", "purchase of property",
            "additions to property", "capital expenditures",
            "purchases of property", "property additions"
        ],
        "investing_cash_flow": [
            "investing cash flow", "cash from investing",
            "cash flow from investing activities", "investing activities",
            "net cash from investing", "cash flows from investing"
        ],
        "financing_cash_flow": [
            "financing cash flow", "cash from financing",
            "cash flow from financing activities", "financing activities",
            "net cash from financing", "cash flows from financing"
        ],

        # =================================================================
        # OPERATING METRICS
        # =================================================================
        "new_customers": ["new customers", "customer additions", "new users"],
        "total_transactions": ["transactions", "orders", "total orders"],
        "total_seats": ["seats", "licenses", "subscriptions"],
        "active_members": ["members", "active count", "active users"],
        "restaurant_margin": ["restaurant margin", "store margin"],
        "effective_tax_rate": ["effective tax rate", "tax rate"],
        "churn_rate": ["churn", "churn rate", "attrition", "cancellation rate"],
        "cac": ["cac", "acquisition cost", "customer acquisition cost"],
        "ltv": ["ltv", "lifetime value", "cltv", "customer lifetime value"],

        # =================================================================
        # DERIVED / SUMMARY ITEMS (often in Excel templates)
        # =================================================================
        "gross_profit": [
            "gross profit", "gross margin", "gross income"
        ],
        "operating_income": [
            "operating income", "operating profit", "ebit", "income from operations"
        ],
        "net_income": [
            "net income", "net profit", "net earnings", "net income attributable"
        ],
        "ebitda": [
            "ebitda", "earnings before interest"
        ],
        "total_assets": [
            "total assets", "assets total"
        ],
        "total_liabilities": [
            "total liabilities", "liabilities total"
        ],
    }

    # Exclusion rules: (field, [terms that should NOT trigger this field]).
    # Exclusion terms are matched as plain substrings (e.g. "accum" is a
    # deliberate prefix).
    EXCLUSIONS: Dict[str, List[str]] = {
        "revenue": ["cost", "marketable securities", "deferred"],
        "total_equity": ["awards", "liability", "liabilities", "debt"],
        "cash": ["non-cash", "noncash"],
        "depreciation": ["accum", "accumulated"],
    }

    # Field categories for validation
    INCOME_FIELDS = [
        "revenue", "cogs", "marketing_expenses", "payroll_expenses", "rent_expense",
        "other_operating_expenses", "depreciation", "amortization", "interest_expense", "taxes",
        "gross_profit", "operating_income", "net_income", "ebitda"
    ]

    BALANCE_FIELDS = [
        "cash", "accounts_receivable", "inventory", "prepaid_expenses",
        "property_plant_equipment", "accumulated_depreciation", "intangible_assets",
        "accounts_payable", "accrued_liabilities", "short_term_debt", "long_term_debt",
        "deferred_revenue", "total_equity", "total_assets", "total_liabilities"
    ]

    CASH_FIELDS = [
        "operating_cash_flow", "capex", "investing_cash_flow", "financing_cash_flow"
    ]

    @staticmethod
    def _contains_term(label: str, term: str) -> bool:
        """Return True if *term* occurs in *label* delimited by non-alphanumerics."""
        start = label.find(term)
        while start != -1:
            end = start + len(term)
            starts_clean = start == 0 or not label[start - 1].isalnum()
            ends_clean = end == len(label) or not label[end].isalnum()
            if starts_clean and ends_clean:
                return True
            start = label.find(term, start + 1)
        return False

    @staticmethod
    def _is_excluded(field: str, label: str) -> bool:
        """Return True if *label* contains any exclusion term registered for *field*."""
        return any(excl in label for excl in DataMapper.EXCLUSIONS.get(field, ()))

    @staticmethod
    def map_row(row_label: str) -> Optional[str]:
        """
        Map a raw field label to a standardized field name.

        The label is lower-cased, stripped, and underscores become spaces.
        An exact match on a field name wins outright; otherwise the field
        owning the longest whole-word alias found in the label is returned,
        skipping fields whose exclusion terms appear in the label.

        Args:
            row_label: The raw label from the source file

        Returns:
            Standardized field name, or None if no match found
        """
        if not row_label:
            return None

        label_clean = str(row_label).lower().strip().replace("_", " ")

        # Direct match check first.  Field names use underscores while the
        # cleaned label uses spaces, so compare in the same form.
        for field in DataMapper.FIELD_MAPPING:
            if label_clean == field.replace("_", " "):
                return field

        # Keyword matching with longest match wins
        best_match_field = None
        best_match_len = 0

        for field, aliases in DataMapper.FIELD_MAPPING.items():
            for alias in aliases:
                if not DataMapper._contains_term(label_clean, alias):
                    continue
                if DataMapper._is_excluded(field, label_clean):
                    continue

                # Longest alias match wins (more specific)
                if len(alias) > best_match_len:
                    best_match_len = len(alias)
                    best_match_field = field

        return best_match_field

    @staticmethod
    def map_row_with_confidence(row_label: str) -> Tuple[Optional[str], float]:
        """
        Map a row label and return confidence score.

        An exact match on a field name or alias scores 1.0.  Otherwise the
        score is the share of the label covered by the best whole-word alias,
        capped at 0.95.

        Returns:
            Tuple of (field_name, confidence) where confidence is 0.0-1.0
        """
        if not row_label:
            return None, 0.0

        label_clean = str(row_label).lower().strip().replace("_", " ")

        # Exact match = 1.0 confidence
        for field, aliases in DataMapper.FIELD_MAPPING.items():
            if label_clean == field.replace("_", " "):
                return field, 1.0
            for alias in aliases:
                if label_clean == alias:
                    return field, 1.0

        # Partial match = proportional confidence
        best_match_field = None
        best_confidence = 0.0

        for field, aliases in DataMapper.FIELD_MAPPING.items():
            for alias in aliases:
                if not DataMapper._contains_term(label_clean, alias):
                    continue
                if DataMapper._is_excluded(field, label_clean):
                    continue

                # Confidence based on how much of the label is matched
                confidence = len(alias) / len(label_clean)
                if confidence > best_confidence:
                    best_confidence = confidence
                    best_match_field = field

        return best_match_field, min(best_confidence, 0.95)  # Cap at 0.95 for non-exact

    @staticmethod
    def get_statement_type(field: str) -> Optional[str]:
        """
        Determine which financial statement a field belongs to.

        Returns:
            "income", "balance", "cash_flow", or None
        """
        if field in DataMapper.INCOME_FIELDS:
            return "income"
        elif field in DataMapper.BALANCE_FIELDS:
            return "balance"
        elif field in DataMapper.CASH_FIELDS:
            return "cash_flow"
        return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/ingestion/parser_csv.py DELETED
@@ -1,127 +0,0 @@
1
- import pandas as pd
2
- import re
3
- from typing import Dict, Any, Optional
4
- from app.schemas.financial import (
5
- FinancialReport,
6
- BalanceSheetStandard,
7
- IncomeStatementStandard,
8
- CashFlowStandard,
9
- OperatingMetrics,
10
- PeriodType,
11
- Currency
12
- )
13
- from datetime import date
14
-
15
- from app.services.ingestion.mappings import DataMapper
16
-
17
class CSVParser:
    """Parses a CSV of financial line items into a FinancialReport."""

    @staticmethod
    def _to_float(val_raw: Any) -> float:
        """
        Coerce a raw CSV cell to float, returning 0.0 when it is unusable.

        Strings are stripped of everything except digits, "." and "-";
        a value wrapped in parentheses is treated as negative (accounting
        notation).  Null/NaN cells become 0.0.
        """
        if isinstance(val_raw, str):
            text = val_raw.strip()
            is_parenthesized = text.startswith("(") and text.endswith(")")
            cleaned = re.sub(r'[^\d.-]', '', text)
            try:
                val = float(cleaned)
            except ValueError:
                return 0.0
            # "(1,234)" means -1234 in financial statements.
            return -val if is_parenthesized and val > 0 else val
        return float(val_raw) if pd.notnull(val_raw) else 0.0

    @staticmethod
    def parse(file_path: str) -> FinancialReport:
        """
        Read *file_path* with pandas and build a FinancialReport.

        Two layouts are supported:
          * Horizontal: at least three column headers map to known fields;
            the last row is taken as the most recent data.
          * Vertical: first column is the label, second column the value.

        Fields that are not found default to 0.0.  Company name, period end,
        period type and currency are filled with fixed placeholder values.
        """
        df = pd.read_csv(file_path)

        data_dict = {}

        # Map every column header once; reused for detection and extraction.
        column_fields = [(col, DataMapper.map_row(str(col))) for col in df.columns]
        matches = sum(1 for _, field in column_fields if field)

        if matches >= 3:
            # Horizontal Format: Take the last row (most recent data).
            # A header-only file has no rows; leave every field at its default.
            if not df.empty:
                last_row = df.iloc[-1]
                for col, field in column_fields:
                    if field:
                        data_dict[field] = CSVParser._to_float(last_row[col])

        # Fallback to Vertical (Key-Value) Format
        elif len(df.columns) >= 2:
            # Assume col 0 is label, col 1 is current period value.
            # .iloc is used because integer keys on a label-indexed row are
            # not reliably positional.
            for _, row in df.iterrows():
                label = str(row.iloc[0])
                val = CSVParser._to_float(row.iloc[1])

                field = DataMapper.map_row(label)
                if field:
                    data_dict[field] = val

        def get(key, default=0.0):
            return data_dict.get(key, default)

        def get_count(key):
            # Counts are optional: a missing or zero value is reported as None.
            return int(get(key)) if get(key) else None

        income = IncomeStatementStandard(
            revenue=get("revenue"),
            cogs=get("cogs"),
            marketing_expenses=get("marketing_expenses"),
            payroll_expenses=get("payroll_expenses"),
            rent_expense=get("rent_expense"),
            other_operating_expenses=get("other_operating_expenses"),
            depreciation=get("depreciation"),
            amortization=get("amortization"),
            interest_expense=get("interest_expense"),
            taxes=get("taxes")
        )

        balance = BalanceSheetStandard(
            cash=get("cash"),
            accounts_receivable=get("accounts_receivable"),
            inventory=get("inventory"),
            prepaid_expenses=get("prepaid_expenses"),
            property_plant_equipment=get("property_plant_equipment"),
            accumulated_depreciation=get("accumulated_depreciation"),
            intangible_assets=get("intangible_assets"),
            accounts_payable=get("accounts_payable"),
            accrued_liabilities=get("accrued_liabilities"),
            short_term_debt=get("short_term_debt"),
            long_term_debt=get("long_term_debt"),
            deferred_revenue=get("deferred_revenue"),
            total_equity=get("total_equity")
        )

        cash_flow = CashFlowStandard(
            operating_cash_flow=get("operating_cash_flow"),
            capex=get("capex"),
            investing_cash_flow=get("investing_cash_flow"),
            financing_cash_flow=get("financing_cash_flow")
        )

        metrics = OperatingMetrics(
            industry='general',  # Default, could extract from metadata
            new_customers=get_count("new_customers"),
            total_transactions=get_count("total_transactions"),
            total_seats=get_count("total_seats")
        )

        return FinancialReport(
            company_name="Imported Company",
            period_end=date.today(),
            period_type=PeriodType.ANNUAL,
            currency=Currency.USD,
            income_statement=income,
            balance_sheet=balance,
            cash_flow=cash_flow,
            metrics=metrics
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/ingestion/parser_dolphin.py DELETED
@@ -1,429 +0,0 @@
1
- """
2
- Hybrid PDF Parser — Combines Dolphin-v2 and pdfplumber for optimal extraction.
3
-
4
- Both engines process every PDF:
5
- Stage 1: Dolphin layout analysis (document structure & reading order)
6
- Stage 2: Document classification (10-K, invoice, bank statement, etc.)
7
- Stage 3: Dolphin element extraction (tables, text, formulas)
8
- Stage 4: pdfplumber gap-fill & validation (tables + regex fallback)
9
- Stage 5: Merge & normalize → FinancialReport
10
- """
11
-
12
- import logging
13
- import re
14
- from typing import Dict, Any, Optional, List
15
- from datetime import date
16
-
17
- from app.schemas.financial import (
18
- FinancialReport,
19
- BalanceSheetStandard,
20
- IncomeStatementStandard,
21
- CashFlowStandard,
22
- OperatingMetrics,
23
- PeriodType,
24
- Currency,
25
- )
26
- from app.services.ingestion.mappings import DataMapper
27
-
28
- logger = logging.getLogger(__name__)
29
-
30
-
31
class HybridPDFParser:
    """
    Hybrid parser that combines Dolphin-v2 deep parsing with pdfplumber
    gap-filling on every PDF for maximum extraction coverage.

    Implements the same `parse(file_path) -> FinancialReport` interface
    as the original PDFParser.
    """

    @staticmethod
    def parse(file_path: str) -> FinancialReport:
        """
        Parse a PDF using the hybrid Dolphin + pdfplumber pipeline.

        Stages:
        1. Dolphin: layout + page parsing
        2. Classify: determine document type
        3. Dolphin: extract structured data from elements
        4. pdfplumber: gap-fill with table + regex extraction
        5. Merge: Dolphin data takes priority, pdfplumber fills gaps

        Falls back to pdfplumber-only if Dolphin is unavailable or if any
        Dolphin stage raises (the failure is logged as a warning).

        Args:
            file_path: Path of the PDF to parse.

        Returns:
            The FinancialReport assembled from the merged extraction.
        """
        dolphin_data = {}
        classification = None
        dolphin_company_name = None
        dolphin_fiscal_year = None
        extraction_method = "pdfplumber"

        # -----------------------------------------------------------------
        # Stage 1-3: Dolphin Extraction (best effort)
        # -----------------------------------------------------------------
        try:
            from app.services.ingestion.dolphin import is_dolphin_available

            if is_dolphin_available():
                logger.info("Dolphin available — running hybrid extraction")
                # The tuple is only unpacked on success, so a failure below
                # leaves every Dolphin variable at its empty default.
                (
                    dolphin_data,
                    classification,
                    dolphin_company_name,
                    dolphin_fiscal_year,
                ) = HybridPDFParser._run_dolphin_stages(file_path)
                extraction_method = "dolphin_hybrid"
            else:
                logger.info("Dolphin not available — pdfplumber-only mode")
        except Exception as e:
            # Deliberate catch-all: Dolphin is optional, pdfplumber still runs.
            logger.warning(f"Dolphin extraction failed, continuing with pdfplumber: {e}")

        # -----------------------------------------------------------------
        # Stage 4: pdfplumber Gap-Fill
        # -----------------------------------------------------------------
        pdfplumber_data, pdfplumber_text = HybridPDFParser._run_pdfplumber(file_path)

        # -----------------------------------------------------------------
        # Stage 5: Merge — Dolphin takes priority, pdfplumber fills gaps
        # -----------------------------------------------------------------
        merged_data = HybridPDFParser._merge_extractions(dolphin_data, pdfplumber_data)

        logger.info(
            f"Merged extraction: {len(dolphin_data)} Dolphin fields + "
            f"{len(pdfplumber_data)} pdfplumber fields → "
            f"{len(merged_data)} total fields"
        )

        # -----------------------------------------------------------------
        # Build FinancialReport
        # -----------------------------------------------------------------
        return HybridPDFParser._build_report(
            extracted_data=merged_data,
            text_content=pdfplumber_text,
            file_path=file_path,
            extraction_method=extraction_method,
            classification=classification,
            dolphin_company_name=dolphin_company_name,
            dolphin_fiscal_year=dolphin_fiscal_year,
        )

    # ==================================================================
    # Stage Implementations
    # ==================================================================

    @staticmethod
    def _run_dolphin_stages(file_path: str):
        """
        Stages 1-3: Dolphin layout, classification, and extraction.

        Returns:
            A 4-tuple ``(extracted, classification, company_name, fiscal_year)``.
            When the parsed document reports zero pages the tuple is
            ``({}, None, None, None)``.
        """
        from app.services.ingestion.dolphin.client import DolphinClient
        from app.services.ingestion.dolphin.classifier import DocumentClassifier
        from app.services.ingestion.dolphin.extractor import DolphinExtractor

        # Stage 1: Parse entire document
        # Use factory to get Local or Remote client
        client = DolphinClient.create()
        doc_result = client.parse_document(file_path)

        if doc_result.total_pages == 0:
            return {}, None, None, None

        # Stage 2: Classify document type
        # Collect section info from every page layout
        all_sections = []
        for layout in doc_result.layouts:
            all_sections.extend(layout.sections)

        classification = DocumentClassifier.classify(
            text_content=doc_result.full_markdown,
            dolphin_sections=all_sections,
        )

        logger.info(
            f"Document classified as '{classification.doc_type}' "
            f"(confidence: {classification.confidence:.2f})"
        )

        # Stage 3: Extract structured financial data
        extracted = DolphinExtractor.extract(doc_result, classification)

        # Also try to extract company name and fiscal year
        company_name = DolphinExtractor.extract_company_name(doc_result)
        fiscal_year = DolphinExtractor.extract_fiscal_year(doc_result)

        return extracted, classification, company_name, fiscal_year

    @staticmethod
    def _run_pdfplumber(file_path: str):
        """
        Stage 4: pdfplumber extraction — tables + regex.

        Reuses the proven logic from the existing PDFParser.

        Returns:
            ``(extracted_data, text_content)``. Any exception is logged as a
            warning and whatever was gathered before the failure is returned.
        """
        from app.services.ingestion.parser_pdf import PDFParser
        import pdfplumber

        extracted_data = {}
        text_parts = []

        try:
            with pdfplumber.open(file_path) as pdf:
                # Statement page locator
                statement_pages = PDFParser._find_statement_pages(pdf)

                # Extract from identified statement pages, restricting each
                # page to the fields that belong to its statement type.
                for stmt_type, page in statement_pages.items():
                    allowed_fields = None
                    if stmt_type == "income":
                        allowed_fields = DataMapper.INCOME_FIELDS
                    elif stmt_type == "balance":
                        allowed_fields = DataMapper.BALANCE_FIELDS
                    elif stmt_type == "cash_flow":
                        allowed_fields = DataMapper.CASH_FIELDS

                    table_data = PDFParser._extract_table_data(page, allowed_fields)
                    extracted_data.update(table_data)

                # Full text extraction for regex fallback; pages that yield
                # no text are skipped.
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text_parts.append(page_text + "\n")

            # Regex fallback for fields the tables did not provide
            regex_data = PDFParser._extract_via_regex(
                "".join(text_parts), existing_keys=extracted_data.keys()
            )
            extracted_data.update(regex_data)

        except Exception as e:
            logger.warning(f"pdfplumber extraction failed: {e}")

        # Joined here (not inside the try) so partial text survives a failure.
        return extracted_data, "".join(text_parts)

    @staticmethod
    def _merge_extractions(
        dolphin_data: Dict[str, Any],
        pdfplumber_data: Dict[str, Any],
    ) -> Dict[str, Any]:
        """
        Merge Dolphin and pdfplumber extractions.

        Priority: Dolphin fields take precedence.
        pdfplumber fills any gaps not covered by Dolphin. A Dolphin value of
        ``None`` or ``0.0`` counts as a gap, and it is only replaced by a
        pdfplumber value that is itself neither ``None`` nor ``0.0``.

        Returns:
            A new dict; neither input is mutated.
        """
        merged = dict(dolphin_data)  # Start with Dolphin data

        for key, value in pdfplumber_data.items():
            if key not in merged:
                merged[key] = value
                continue

            current = merged[key]
            dolphin_has_gap = current is None or current == 0.0
            pdfplumber_has_value = value is not None and value != 0.0
            if dolphin_has_gap and pdfplumber_has_value:
                # Dolphin gave nothing usable but pdfplumber found a value
                merged[key] = value

        return merged

    # ==================================================================
    # Report Construction (mirrors PDFParser logic)
    # ==================================================================

    @staticmethod
    def _build_report(
        extracted_data: Dict,
        text_content: str,
        file_path: str,
        extraction_method: str,
        classification=None,
        dolphin_company_name: Optional[str] = None,
        dolphin_fiscal_year: Optional[str] = None,
    ) -> FinancialReport:
        """
        Build a FinancialReport from merged extracted data.

        Missing or ``None`` fields default to 0.0. Subtotals (gross profit,
        EBITDA, total assets, ...) are computed from the extracted line items.
        The report is always tagged ANNUAL / USD.
        """

        def get(key, default=0.0):
            val = extracted_data.get(key)
            return val if val is not None else default

        def value_or_none(key):
            # Falsy values (missing, None, 0) are reported as None.
            val = get(key)
            return val if val else None

        def int_or_none(key):
            val = get(key)
            return int(val) if val else None

        # --- Income Statement ---
        revenue = get("revenue")
        cogs = get("cogs")
        marketing = get("marketing_expenses")
        payroll = get("payroll_expenses")
        rent = get("rent_expense")
        other = get("other_operating_expenses")
        depreciation = get("depreciation")
        amortization = get("amortization")
        interest = get("interest_expense")
        taxes = get("taxes")

        op_expenses = marketing + payroll + rent + other
        gross_profit = revenue - cogs
        ebitda = gross_profit - op_expenses
        op_income = ebitda - depreciation - amortization
        net_income = op_income - interest - taxes

        income = IncomeStatementStandard(
            revenue=revenue,
            cogs=cogs,
            marketing_expenses=marketing,
            payroll_expenses=payroll,
            rent_expense=rent,
            other_operating_expenses=other,
            depreciation=depreciation,
            amortization=amortization,
            interest_expense=interest,
            taxes=taxes,
            operating_expenses=op_expenses,
            gross_profit=gross_profit,
            ebitda=ebitda,
            operating_income=op_income,
            net_income=net_income,
        )

        # --- Balance Sheet ---
        cash = get("cash")
        ar = get("accounts_receivable")
        inv = get("inventory")
        prepaid = get("prepaid_expenses")
        ppe = get("property_plant_equipment")
        accum_dep = get("accumulated_depreciation")
        intangibles = get("intangible_assets")
        ap = get("accounts_payable")
        accrued = get("accrued_liabilities")
        st_debt = get("short_term_debt")
        lt_debt = get("long_term_debt")
        deferred = get("deferred_revenue")
        equity = get("total_equity")

        bs_current_assets = cash + ar + inv + prepaid
        bs_total_assets = bs_current_assets + (ppe - accum_dep) + intangibles
        bs_current_liab = ap + accrued + st_debt
        bs_total_liab = bs_current_liab + lt_debt + deferred

        balance = BalanceSheetStandard(
            cash=cash,
            accounts_receivable=ar,
            inventory=inv,
            prepaid_expenses=prepaid,
            property_plant_equipment=ppe,
            accumulated_depreciation=accum_dep,
            intangible_assets=intangibles,
            accounts_payable=ap,
            accrued_liabilities=accrued,
            short_term_debt=st_debt,
            long_term_debt=lt_debt,
            deferred_revenue=deferred,
            total_equity=equity,
            total_current_assets=bs_current_assets,
            total_assets=bs_total_assets,
            total_current_liabilities=bs_current_liab,
            total_liabilities=bs_total_liab,
        )

        # --- Cash Flow ---
        cash_flow = CashFlowStandard(
            operating_cash_flow=get("operating_cash_flow"),
            capex=get("capex"),
            investing_cash_flow=get("investing_cash_flow"),
            financing_cash_flow=get("financing_cash_flow"),
            net_change_in_cash=get("net_change_in_cash"),
        )

        # --- Operating Metrics ---
        metrics = OperatingMetrics(
            # A non-zero restaurant margin is the only industry signal used.
            industry="restaurant" if get("restaurant_margin") else "general",
            new_customers=int_or_none("new_customers"),
            total_transactions=int_or_none("total_transactions"),
            total_seats=int_or_none("total_seats"),
            churn_rate=value_or_none("churn_rate"),
            cac=value_or_none("cac"),
            ltv=value_or_none("ltv"),
        )

        # --- Metadata ---
        metadata = {
            "extraction_method": extraction_method,
            "extracted_restaurant_margin": str(get("restaurant_margin")),
            "extracted_effective_tax_rate": str(get("effective_tax_rate")),
        }

        if classification:
            metadata["document_type"] = classification.doc_type
            metadata["classification_confidence"] = str(classification.confidence)
            metadata["detected_sections"] = ",".join(classification.detected_sections)

        # --- Company Name ---
        company_name = HybridPDFParser._resolve_company_name(
            dolphin_name=dolphin_company_name,
            text_content=text_content,
            file_path=file_path,
        )

        # --- Fiscal Year ---
        fiscal_year_date = HybridPDFParser._resolve_fiscal_year(
            dolphin_year=dolphin_fiscal_year,
            text_content=text_content,
        )

        return FinancialReport(
            company_name=company_name,
            period_end=fiscal_year_date,
            period_type=PeriodType.ANNUAL,
            currency=Currency.USD,
            income_statement=income,
            balance_sheet=balance,
            cash_flow=cash_flow,
            metrics=metrics,
            metadata=metadata,
        )

    # ==================================================================
    # Name & Date Resolution
    # ==================================================================

    @staticmethod
    def _resolve_company_name(
        dolphin_name: Optional[str],
        text_content: str,
        file_path: str,
    ) -> str:
        """
        Resolve company name: Dolphin → text heuristics → filename.

        Text heuristics, in order:
        1. The nearest acceptable line above an "exact name of registrant"
           marker found within the first 100 lines.
        2. The first acceptable line among the first 40 lines.
        A line is acceptable when it is longer than two characters and
        contains none of the boilerplate phrases below; heuristic 2 also
        requires at least one letter. Results are capped at 100 characters.
        """
        if dolphin_name:
            return dolphin_name

        # The PDFParser heuristics are inline in its parse(), so the core
        # logic is replicated here rather than imported.
        lines = text_content.split("\n")
        ignored = {
            "TABLE OF CONTENTS", "CONTENTS", "INDEX", "FINANCIAL STATEMENTS",
            "CONSOLIDATED FINANCIAL STATEMENTS", "ANNUAL REPORT", "QUARTERLY REPORT",
            "10-K", "10-Q", "FORM 10-K", "FORM 10-Q", "UNITED STATES",
            "SECURITIES AND EXCHANGE COMMISSION", "WASHINGTON", "D.C.",
        }

        def is_boilerplate(candidate):
            upper = candidate.upper()
            return any(ig in upper for ig in ignored)

        # SEC filing heuristic
        registrant_idx = -1
        for i, line in enumerate(lines[:100]):
            if "exact name of registrant" in line.lower():
                registrant_idx = i
                break

        # An index of 0 means there is no line above the marker to inspect.
        if registrant_idx > 0:
            for j in range(registrant_idx - 1, -1, -1):
                candidate = lines[j].strip()
                if len(candidate) > 2 and not is_boilerplate(candidate):
                    return candidate[:100]

        # First meaningful line
        for line in lines[:40]:
            candidate = line.strip()
            if (
                len(candidate) > 2
                and not is_boilerplate(candidate)
                and not candidate.isdigit()
                and any(c.isalpha() for c in candidate)
            ):
                return candidate[:100]

        # Filename fallback
        import os
        basename = os.path.basename(file_path)
        return os.path.splitext(basename)[0].replace("-", " ").replace("_", " ")

    @staticmethod
    def _resolve_fiscal_year(
        dolphin_year: Optional[str],
        text_content: str,
    ) -> date:
        """
        Resolve fiscal year: Dolphin → text patterns → today.

        A Dolphin year is accepted when it contains a 4-digit number between
        1990 and next calendar year; the result is then December 31 of that
        year. Otherwise the decision is delegated to
        ``PDFParser._extract_fiscal_year``.
        """
        # Try Dolphin result first
        if dolphin_year:
            year_match = re.search(r"\d{4}", dolphin_year)
            if year_match:
                y = int(year_match.group(0))
                if 1990 <= y <= date.today().year + 1:
                    return date(y, 12, 31)

        # Reuse PDFParser's fiscal year extraction
        from app.services.ingestion.parser_pdf import PDFParser
        return PDFParser._extract_fiscal_year(text_content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/ingestion/parser_pdf.py DELETED
@@ -1,402 +0,0 @@
1
- import pdfplumber
2
- import re
3
- from typing import Dict, Any, Optional, List
4
- from app.schemas.financial import (
5
- FinancialReport,
6
- BalanceSheetStandard,
7
- IncomeStatementStandard,
8
- CashFlowStandard,
9
- OperatingMetrics,
10
- PeriodType,
11
- Currency
12
- )
13
- from datetime import date
14
- from app.services.ingestion.mappings import DataMapper
15
-
16
class PDFParser:
    """pdfplumber-based parser that turns a financial PDF into a FinancialReport."""

    @staticmethod
    def parse(file_path: str) -> FinancialReport:
        """
        Parse a PDF and return a standardized FinancialReport.

        Steps: locate the statement pages, read their tables, fill missing
        fields with a regex pass over the full text, detect the fiscal year,
        compute subtotals and resolve the company name. Fields that were not
        found default to 0.0. The report is always tagged ANNUAL / USD.

        Args:
            file_path: Path of the PDF to parse.
        """
        extracted_data = {}

        with pdfplumber.open(file_path) as pdf:
            # 1. Statement Locator Strategy (Find Income, Balance, Cash Flow pages)
            statement_pages = PDFParser._find_statement_pages(pdf)

            # 2. Extract Data from Tables on those pages
            for stmt_type, page in statement_pages.items():
                print(f"Processing {stmt_type} on page {page.page_number}")

                # Determine allowed fields based on statement type
                allowed_fields = None
                if stmt_type == "income":
                    allowed_fields = DataMapper.INCOME_FIELDS
                elif stmt_type == "balance":
                    allowed_fields = DataMapper.BALANCE_FIELDS
                elif stmt_type == "cash_flow":
                    allowed_fields = DataMapper.CASH_FIELDS

                table_data = PDFParser._extract_table_data(page, allowed_fields)
                extracted_data.update(table_data)

            # 3. Global Text Extraction (for Regex Fallback & Metrics)
            # `or ""` guards against pages for which extract_text() yields
            # None, the same guard _find_statement_pages applies.
            text_content = "".join(
                (page.extract_text() or "") + "\n" for page in pdf.pages
            )

        # 4. Fallback / Regex Strategy for missing fields
        regex_data = PDFParser._extract_via_regex(text_content, existing_keys=extracted_data.keys())
        extracted_data.update(regex_data)

        # 5. Extract Fiscal Year
        fiscal_year_date = PDFParser._extract_fiscal_year(text_content)

        # 6. Construct Financial Objects
        def get(key, default=0.0):
            val = extracted_data.get(key)
            return default if val is None else val

        def value_or_none(key):
            # Falsy values (missing, None, 0) are reported as None.
            val = get(key)
            return val if val else None

        def int_or_none(key):
            val = get(key)
            return int(val) if val else None

        # Income statement line items and computed subtotals
        revenue = get("revenue")
        cogs = get("cogs")
        marketing = get("marketing_expenses")
        payroll = get("payroll_expenses")
        rent = get("rent_expense")
        other = get("other_operating_expenses")
        depreciation = get("depreciation")
        amortization = get("amortization")
        interest = get("interest_expense")
        taxes = get("taxes")

        op_expenses = marketing + payroll + rent + other
        gross_profit = revenue - cogs
        ebitda = gross_profit - op_expenses
        op_income = ebitda - depreciation - amortization
        net_income = op_income - interest - taxes

        income = IncomeStatementStandard(
            revenue=revenue,
            cogs=cogs,
            marketing_expenses=marketing,
            payroll_expenses=payroll,
            rent_expense=rent,
            other_operating_expenses=other,
            depreciation=depreciation,
            amortization=amortization,
            interest_expense=interest,
            taxes=taxes,
            # Computed
            operating_expenses=op_expenses,
            gross_profit=gross_profit,
            ebitda=ebitda,
            operating_income=op_income,
            net_income=net_income,
        )

        # Balance sheet line items and computed totals
        cash = get("cash")
        ar = get("accounts_receivable")
        inv = get("inventory")
        prepaid = get("prepaid_expenses")
        ppe = get("property_plant_equipment")
        accum_dep = get("accumulated_depreciation")
        intangibles = get("intangible_assets")

        ap = get("accounts_payable")
        accrued = get("accrued_liabilities")
        st_debt = get("short_term_debt")
        lt_debt = get("long_term_debt")
        deferred = get("deferred_revenue")
        equity = get("total_equity")

        bs_current_assets = cash + ar + inv + prepaid
        bs_total_assets = bs_current_assets + (ppe - accum_dep) + intangibles
        bs_current_liab = ap + accrued + st_debt
        bs_total_liab = bs_current_liab + lt_debt + deferred

        balance = BalanceSheetStandard(
            cash=cash,
            accounts_receivable=ar,
            inventory=inv,
            prepaid_expenses=prepaid,
            property_plant_equipment=ppe,
            accumulated_depreciation=accum_dep,
            intangible_assets=intangibles,
            accounts_payable=ap,
            accrued_liabilities=accrued,
            short_term_debt=st_debt,
            long_term_debt=lt_debt,
            deferred_revenue=deferred,
            total_equity=equity,
            # Computed
            total_current_assets=bs_current_assets,
            total_assets=bs_total_assets,
            total_current_liabilities=bs_current_liab,
            total_liabilities=bs_total_liab,
        )

        cash_flow = CashFlowStandard(
            operating_cash_flow=get("operating_cash_flow"),
            capex=get("capex"),
            investing_cash_flow=get("investing_cash_flow"),
            financing_cash_flow=get("financing_cash_flow"),
            net_change_in_cash=get("net_change_in_cash"),
        )

        metrics = OperatingMetrics(
            # A non-zero restaurant margin is the only industry signal used.
            industry='restaurant' if get("restaurant_margin") else 'general',
            new_customers=int_or_none("new_customers"),
            total_transactions=int_or_none("total_transactions"),
            total_seats=int_or_none("total_seats"),
            churn_rate=value_or_none("churn_rate"),
            cac=value_or_none("cac"),
            ltv=value_or_none("ltv"),
        )

        metadata = {
            "extracted_restaurant_margin": str(get("restaurant_margin")),
            "extracted_effective_tax_rate": str(get("effective_tax_rate")),
        }

        company_name = PDFParser._extract_company_name(text_content, file_path)

        return PDFParser._finalize_report(company_name, income, balance, cash_flow, metrics, metadata, fiscal_year_date)

    @staticmethod
    def _extract_company_name(text_content: str, file_path: str) -> str:
        """
        Guess the company name from the document text, else from the filename.

        Heuristics, in order:
        1. SEC filing: the nearest line above an "Exact name of registrant"
           marker (searched in the first 100 lines) that is longer than two
           characters and mentions neither "FORM" nor "UNITED STATES".
        2. Top of page: the first of the first 40 lines that is not known
           boilerplate, is not purely digits and contains a letter
           (capped at 100 characters).
        3. The file's base name with "-" and "_" turned into spaces.
        """
        placeholder = "Detected via OCR"
        company_name = None
        extracted_lines = text_content.split('\n')

        # 1. SEC Filing Heuristic
        registrant_marker = "Exact name of registrant".lower()
        registrant_index = -1
        for i, line in enumerate(extracted_lines[:100]):
            if registrant_marker in line.lower():
                registrant_index = i
                break

        # An index of 0 means there is no line above the marker to inspect.
        if registrant_index > 0:
            for j in range(registrant_index - 1, -1, -1):
                candidate = extracted_lines[j].strip()
                upper = candidate.upper()
                if len(candidate) > 2 and "FORM" not in upper and "UNITED STATES" not in upper:
                    company_name = candidate
                    break

        # 2. Top-of-page Heuristic
        if company_name is None:
            ignored_names = [
                "TABLE OF CONTENTS", "CONTENTS", "INDEX", "FINANCIAL STATEMENTS",
                "CONSOLIDATED FINANCIAL STATEMENTS", "ANNUAL REPORT", "QUARTERLY REPORT",
                "10-K", "10-Q", "FORM 10-K", "FORM 10-Q", "UNITED STATES",
                "SECURITIES AND EXCHANGE COMMISSION", "WASHINGTON", "D.C.",
                "COMMISSION FILE NUMBER", "TRANSITION REPORT", "QUARTERLY REPORT PURSUANT"
            ]

            for line in extracted_lines[:40]:
                candidate = line.strip()
                upper = candidate.upper()
                if (
                    len(candidate) > 2
                    and not any(ignore in upper for ignore in ignored_names)
                    and not candidate.isdigit()
                    and "FILE NUMBER" not in upper
                    and any(c.isalpha() for c in candidate)
                ):
                    company_name = candidate[:100]
                    break

        # 3. Filename Fallback (also used if the text literally is the placeholder)
        if company_name is None or company_name == placeholder:
            import os
            basename = os.path.basename(file_path)
            company_name = os.path.splitext(basename)[0].replace("-", " ").replace("_", " ")

        return company_name

    @staticmethod
    def _finalize_report(name, income, balance, cash, metrics, meta, period_end):
        """Assemble the final FinancialReport (always ANNUAL / USD)."""
        return FinancialReport(
            company_name=name,
            period_end=period_end,
            period_type=PeriodType.ANNUAL,
            currency=Currency.USD,
            income_statement=income,
            balance_sheet=balance,
            cash_flow=cash,
            metrics=metrics,
            metadata=meta,
        )

    @staticmethod
    def _extract_fiscal_year(text: str) -> date:
        """
        Find the fiscal year end date in the first 5000 characters of *text*.

        Recognizes "Year/Period/Fiscal Year Ended <Month> <day>, <year>" and
        "December 31, <year>". Only the year is used: the result is always
        December 31 of the largest plausible year found (1990 through next
        calendar year). Returns today's date when nothing matches.
        """
        patterns = [
            r"(?:YEAR|PERIOD|FISCAL YEAR)\s+ENDED\s+([A-Z]+\s+\d{1,2},\s+\d{4})",
            r"DECEMBER\s+31,\s+(\d{4})"
        ]

        current_year = date.today().year
        header = text[:5000]  # Search first 5000 chars
        found_years = []

        for pat in patterns:
            # Each pattern has exactly one group, so findall yields strings.
            for captured in re.findall(pat, header, re.IGNORECASE):
                year_match = re.search(r"\d{4}", captured)
                if year_match:
                    y = int(year_match.group(0))
                    if 1990 <= y <= current_year + 1:
                        found_years.append(y)

        if found_years:
            # The max year in the header is usually the current report year.
            return date(max(found_years), 12, 31)  # Default to Dec 31

        return date.today()

    @staticmethod
    def _find_statement_pages(pdf) -> Dict[str, Any]:
        """
        Identify pages containing specific financial statements.

        Returns a dict with up to three keys ("income", "balance",
        "cash_flow"), each mapped to the first page whose upper-cased text
        contains one of that statement's title keywords. A page is assigned
        to at most one statement type, checked in that order.
        """
        income_titles = (
            "CONSOLIDATED STATEMENTS OF OPERATIONS",
            "CONSOLIDATED STATEMENTS OF INCOME",
            "CONSOLIDATED STATEMENTS OF EARNINGS",
            "CONSOLIDATED STATEMENTS OF LOSS",
            "STATEMENT OF INCOME",
            "STATEMENTS OF OPERATIONS",
        )
        balance_titles = (
            "CONSOLIDATED BALANCE SHEETS",
            "CONSOLIDATED STATEMENTS OF FINANCIAL POSITION",
            "BALANCE SHEETS",
            "FINANCIAL POSITION",
        )
        cash_flow_titles = (
            "CONSOLIDATED STATEMENTS OF CASH FLOWS",
            "CONSOLIDATED STATEMENT OF CASH FLOWS",
            "STATEMENTS OF CASH FLOWS",
            "CASH FLOWS",
        )

        pages = {}
        for page in pdf.pages:
            text = (page.extract_text() or "").upper()

            # Skip Table of Contents pages (unless they contain financial data like '$')
            looks_like_toc = "TABLE OF CONTENTS" in text[:500] or "INDEX" in text[:200]
            if looks_like_toc and "$" not in text[:2000]:
                continue

            if any(title in text for title in income_titles):
                pages.setdefault("income", page)
            elif any(title in text for title in balance_titles):
                pages.setdefault("balance", page)
            elif any(title in text for title in cash_flow_titles):
                pages.setdefault("cash_flow", page)

        return pages

    @staticmethod
    def _detect_scale_multiplier(page) -> float:
        """Return 1e6 / 1e3 / 1.0 from an "(in millions)"-style note in the page's first 1000 chars."""
        header_text = (page.extract_text() or "")[:1000].lower()
        if re.search(r"\(in millions\)|in millions, except|dollares en millones|amounts in millions|dollars in millions", header_text):
            return 1000000.0
        if re.search(r"\(in thousands\)|in thousands, except|dollares en miles|amounts in thousands|dollars in thousands|\(in 000s\)", header_text):
            return 1000.0
        return 1.0

    @staticmethod
    def _latest_year_column(table) -> int:
        """Return the column index holding the largest year (2001-2099) in the first 5 rows, or -1."""
        target_col_idx = -1
        max_year = 0
        for row in table[:5]:
            for idx, cell in enumerate(row):
                if not cell:
                    continue
                cleaned = cell.replace("$", "").strip()
                if re.match(r"^\d{4}$", cleaned):
                    y = int(cleaned)
                    if 2000 < y < 2100 and y > max_year:
                        max_year = y
                        target_col_idx = idx
        return target_col_idx

    @staticmethod
    def _extract_table_data(page, allowed_fields: Optional[List[str]] = None) -> Dict[str, float]:
        """
        Extract key-value pairs from tables on the page with smart column selection.

        For every table row whose first cell maps to a known field (via
        ``DataMapper.map_row``), the value is read from the most recent year
        column when one was detected, otherwise from the first cell that
        parses as a number. Values are multiplied by the page's scale
        ("in thousands" / "in millions").

        Args:
            page: A pdfplumber page.
            allowed_fields: When given, mapped fields outside it are ignored.
        """
        data = {}
        tables = page.extract_tables()
        if not tables:
            return data

        # The scale note is a property of the page, so detect it once.
        multiplier = PDFParser._detect_scale_multiplier(page)

        for table in tables:
            # 1. Identify "Current Year" column; -1 means "use first numeric cell"
            target_col_idx = PDFParser._latest_year_column(table)

            print(f"Detected scale multiplier: {multiplier}")

            for row in table:
                if not row or not row[0]:
                    continue

                mapped_field = DataMapper.map_row(row[0])
                if not mapped_field:
                    continue
                if allowed_fields is not None and mapped_field not in allowed_fields:
                    continue

                # Extract Value
                val = None
                if target_col_idx != -1 and target_col_idx < len(row):
                    # TRUSTED COLUMN
                    val = PDFParser._clean_value(row[target_col_idx])
                else:
                    # FALLBACK: First numeric column
                    for col_val in row[1:]:
                        clean_val = PDFParser._clean_value(col_val)
                        if clean_val is not None:
                            val = clean_val
                            break

                if val is not None:
                    data[mapped_field] = val * multiplier
        return data

    @staticmethod
    def _clean_value(val_str: Optional[str]) -> Optional[float]:
        """
        Convert a financial string to float, or None if it is not numeric.

        Strips "$", thousands separators and spaces; treats "(123)" as -123;
        treats a lone dash (hyphen, en dash or em dash) as 0.0; accepts the
        Unicode minus sign as a negative sign.
        """
        if not val_str:
            return None

        s = val_str.strip().replace("$", "").replace(",", "").replace(" ", "")
        # float() only understands the ASCII hyphen as a sign.
        s = s.replace("\u2212", "-")
        if not s:
            return None

        # Handle (123) as negative
        if "(" in s and ")" in s:
            s = s.replace("(", "-").replace(")", "")

        # Handle - as 0 (accounting format sometimes uses a dash for 0)
        if s in ("-", "\u2013", "—"):
            return 0.0

        try:
            return float(s)
        except ValueError:
            return None

    @staticmethod
    def _extract_via_regex(text_content: str, existing_keys: List[str]) -> Dict[str, float]:
        """
        Fallback extraction for items not found in tables.

        For every field in ``DataMapper.FIELD_MAPPING`` that is not already in
        *existing_keys*, each alias is searched for in the text followed by a
        number; the first alias that yields a parseable value wins.
        """
        data = {}
        # Iterate over all mappings, skip if already found
        for field, aliases in DataMapper.FIELD_MAPPING.items():
            if field in existing_keys:
                continue

            for k in aliases:
                # Regex matches "Keyword $1,234.56" or "Keyword....... 1,234.56"
                # NOTE(review): aliases are escaped on the assumption that they
                # are plain-text labels, not regex fragments; confirm against
                # DataMapper.FIELD_MAPPING.
                pattern = re.compile(rf"{re.escape(k)}[^0-9-]*?(\(?[\d,]+\.?\d*\)?)", re.IGNORECASE)
                match = pattern.search(text_content)
                if match:
                    val = PDFParser._clean_value(match.group(1))
                    if val is not None:
                        data[field] = val
                        break
        return data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/ingestion/parser_xlsx.py DELETED
@@ -1,312 +0,0 @@
1
- """
2
- XLSX Parser - Excel file parsing for financial data.
3
-
4
- Parses Excel workbooks containing financial statements, handling:
5
- - Multi-sheet detection (Income Statement, Balance Sheet, Cash Flow)
6
- - Single-sheet condensed format
7
- - Various column/row layouts
8
- """
9
-
10
- import re
11
- from typing import Dict, Any, Optional, List
12
- from datetime import date
13
-
14
- try:
15
- import openpyxl
16
- from openpyxl import load_workbook
17
- from openpyxl.worksheet.worksheet import Worksheet
18
- except ImportError:
19
- openpyxl = None
20
-
21
- import pandas as pd
22
-
23
- from app.schemas.financial import (
24
- FinancialReport,
25
- BalanceSheetStandard,
26
- IncomeStatementStandard,
27
- CashFlowStandard,
28
- OperatingMetrics,
29
- PeriodType,
30
- Currency
31
- )
32
- from app.services.ingestion.mappings import DataMapper
33
-
34
-
35
- class XLSXParser:
36
- """Parser for Excel (.xlsx, .xls) financial files."""
37
-
38
- # Keywords to identify sheet types
39
- INCOME_KEYWORDS = ['income', 'p&l', 'profit', 'loss', 'revenue', 'earnings']
40
- BALANCE_KEYWORDS = ['balance', 'assets', 'liabilities', 'position']
41
- CASHFLOW_KEYWORDS = ['cash flow', 'cashflow', 'cash', 'liquidity']
42
-
43
- @staticmethod
44
- def parse(file_path: str) -> FinancialReport:
45
- """
46
- Parse an Excel file and return a standardized FinancialReport.
47
-
48
- Handles both multi-sheet and single-sheet formats.
49
- """
50
- if openpyxl is None:
51
- # Fallback to pandas-only parsing
52
- return XLSXParser._parse_with_pandas(file_path)
53
-
54
- try:
55
- wb = load_workbook(file_path, data_only=True)
56
-
57
- # Categorize sheets
58
- income_sheet = None
59
- balance_sheet = None
60
- cashflow_sheet = None
61
-
62
- for sheet_name in wb.sheetnames:
63
- name_lower = sheet_name.lower()
64
-
65
- if any(kw in name_lower for kw in XLSXParser.INCOME_KEYWORDS):
66
- income_sheet = wb[sheet_name]
67
- elif any(kw in name_lower for kw in XLSXParser.BALANCE_KEYWORDS):
68
- balance_sheet = wb[sheet_name]
69
- elif any(kw in name_lower for kw in XLSXParser.CASHFLOW_KEYWORDS):
70
- cashflow_sheet = wb[sheet_name]
71
-
72
- # If no specialized sheets found, use first sheet for all
73
- if not income_sheet and not balance_sheet and not cashflow_sheet:
74
- default_sheet = wb.active
75
- income_sheet = balance_sheet = cashflow_sheet = default_sheet
76
-
77
- # Extract data from each sheet
78
- data_dict = {}
79
-
80
- if income_sheet:
81
- data_dict.update(XLSXParser._extract_from_sheet(income_sheet))
82
- if balance_sheet and balance_sheet != income_sheet:
83
- data_dict.update(XLSXParser._extract_from_sheet(balance_sheet))
84
- if cashflow_sheet and cashflow_sheet != income_sheet and cashflow_sheet != balance_sheet:
85
- data_dict.update(XLSXParser._extract_from_sheet(cashflow_sheet))
86
-
87
- # If still no data, try pandas fallback
88
- if not data_dict:
89
- return XLSXParser._parse_with_pandas(file_path)
90
-
91
- # Extract company name from filename or first cell
92
- company_name = XLSXParser._extract_company_name(wb)
93
-
94
- return XLSXParser._build_report(data_dict, company_name)
95
-
96
- except Exception as e:
97
- # Fallback to pandas
98
- print(f"openpyxl parse failed, falling back to pandas: {e}")
99
- return XLSXParser._parse_with_pandas(file_path)
100
-
101
- @staticmethod
102
- def _extract_from_sheet(sheet: 'Worksheet') -> Dict[str, float]:
103
- """Extract financial data from a worksheet."""
104
- data = {}
105
-
106
- # Try to find the data range
107
- # Look for rows with label in first column and numeric value in subsequent columns
108
- for row in sheet.iter_rows(min_row=1, max_row=min(200, sheet.max_row)):
109
- if not row or not row[0].value:
110
- continue
111
-
112
- label = str(row[0].value).strip()
113
- field = DataMapper.map_row(label)
114
-
115
- if field:
116
- # Find the first non-empty numeric value in this row
117
- for cell in row[1:]:
118
- if cell.value is not None:
119
- try:
120
- val = XLSXParser._clean_value(cell.value)
121
- if val is not None:
122
- data[field] = val
123
- break
124
- except:
125
- continue
126
-
127
- return data
128
-
129
- @staticmethod
130
- def _clean_value(val: Any) -> Optional[float]:
131
- """Clean and convert a cell value to float."""
132
- if val is None:
133
- return None
134
- if isinstance(val, (int, float)):
135
- return float(val)
136
- if isinstance(val, str):
137
- # Remove currency symbols, commas, parentheses for negatives
138
- cleaned = re.sub(r'[,$]', '', val.strip())
139
- # Handle (1000) format for negatives
140
- if cleaned.startswith('(') and cleaned.endswith(')'):
141
- cleaned = '-' + cleaned[1:-1]
142
- try:
143
- return float(cleaned)
144
- except ValueError:
145
- return None
146
- return None
147
-
148
- @staticmethod
149
- def _extract_company_name(wb) -> str:
150
- """Try to extract company name from workbook."""
151
- # Check first sheet, first few cells
152
- sheet = wb.active
153
- for row in sheet.iter_rows(min_row=1, max_row=5, max_col=3):
154
- for cell in row:
155
- if cell.value and isinstance(cell.value, str):
156
- val = cell.value.strip()
157
- # Skip common headers
158
- if len(val) > 3 and len(val) < 100:
159
- lower = val.lower()
160
- if not any(kw in lower for kw in ['balance', 'income', 'cash', 'statement', 'period', 'date', 'quarter', 'annual']):
161
- return val
162
- return "Imported Company"
163
-
164
- @staticmethod
165
- def _parse_with_pandas(file_path: str) -> FinancialReport:
166
- """Fallback parsing using pandas."""
167
- try:
168
- # Read all sheets
169
- xl = pd.ExcelFile(file_path)
170
- data_dict = {}
171
-
172
- for sheet_name in xl.sheet_names:
173
- df = pd.read_excel(xl, sheet_name=sheet_name)
174
-
175
- if df.empty:
176
- continue
177
-
178
- # Try vertical format (label in col 0, value in col 1+)
179
- if len(df.columns) >= 2:
180
- for _, row in df.iterrows():
181
- label = str(row.iloc[0]) if pd.notna(row.iloc[0]) else ""
182
- field = DataMapper.map_row(label)
183
- if field:
184
- # Find first numeric value
185
- for val in row.iloc[1:]:
186
- if pd.notna(val):
187
- try:
188
- data_dict[field] = float(str(val).replace(',', '').replace('$', ''))
189
- break
190
- except:
191
- continue
192
-
193
- return XLSXParser._build_report(data_dict, "Imported Company")
194
-
195
- except Exception as e:
196
- print(f"Pandas XLSX parse failed: {e}")
197
- return XLSXParser._build_empty_report()
198
-
199
- @staticmethod
200
- def _build_report(data_dict: Dict[str, float], company_name: str) -> FinancialReport:
201
- """Build FinancialReport from extracted data."""
202
- def get(key: str, default: float = 0.0) -> float:
203
- return data_dict.get(key, default)
204
-
205
- # Computed Income
206
- revenue = get("revenue")
207
- cogs = get("cogs")
208
- marketing = get("marketing_expenses")
209
- payroll = get("payroll_expenses")
210
- rent = get("rent_expense")
211
- other = get("other_operating_expenses")
212
- depreciation = get("depreciation")
213
- amortization = get("amortization")
214
- interest = get("interest_expense")
215
- taxes = get("taxes")
216
-
217
- op_expenses = marketing + payroll + rent + other
218
- gross_profit = revenue - cogs
219
- ebitda = gross_profit - op_expenses
220
- op_income = ebitda - depreciation - amortization
221
- net_income = op_income - interest - taxes
222
-
223
- income = IncomeStatementStandard(
224
- revenue=revenue,
225
- cogs=cogs,
226
- marketing_expenses=marketing,
227
- payroll_expenses=payroll,
228
- rent_expense=rent,
229
- other_operating_expenses=other,
230
- depreciation=depreciation,
231
- amortization=amortization,
232
- interest_expense=interest,
233
- taxes=taxes,
234
- # Computed
235
- operating_expenses=op_expenses,
236
- gross_profit=gross_profit,
237
- ebitda=ebitda,
238
- operating_income=op_income,
239
- net_income=net_income
240
- )
241
-
242
- # Computed Balance
243
- cash = get("cash")
244
- ar = get("accounts_receivable")
245
- inv = get("inventory")
246
- prepaid = get("prepaid_expenses")
247
- ppe = get("property_plant_equipment")
248
- accum_dep = get("accumulated_depreciation")
249
- intangibles = get("intangible_assets")
250
-
251
- ap = get("accounts_payable")
252
- accrued = get("accrued_liabilities")
253
- st_debt = get("short_term_debt")
254
- lt_debt = get("long_term_debt")
255
- deferred = get("deferred_revenue")
256
- equity = get("total_equity")
257
-
258
- bs_current_assets = cash + ar + inv + prepaid
259
- bs_total_assets = bs_current_assets + (ppe - accum_dep) + intangibles
260
- bs_current_liab = ap + accrued + st_debt
261
- bs_total_liab = bs_current_liab + lt_debt + deferred
262
-
263
- balance = BalanceSheetStandard(
264
- cash=cash,
265
- accounts_receivable=ar,
266
- inventory=inv,
267
- prepaid_expenses=prepaid,
268
- property_plant_equipment=ppe,
269
- accumulated_depreciation=accum_dep,
270
- intangible_assets=intangibles,
271
- accounts_payable=ap,
272
- accrued_liabilities=accrued,
273
- short_term_debt=st_debt,
274
- long_term_debt=lt_debt,
275
- deferred_revenue=deferred,
276
- total_equity=equity,
277
- # Computed
278
- total_current_assets=bs_current_assets,
279
- total_assets=bs_total_assets,
280
- total_current_liabilities=bs_current_liab,
281
- total_liabilities=bs_total_liab
282
- )
283
-
284
- cash_flow = CashFlowStandard(
285
- operating_cash_flow=get("operating_cash_flow"),
286
- capex=get("capex"),
287
- investing_cash_flow=get("investing_cash_flow"),
288
- financing_cash_flow=get("financing_cash_flow")
289
- )
290
-
291
- metrics = OperatingMetrics(
292
- industry='general',
293
- new_customers=int(get("new_customers")) if get("new_customers") else None,
294
- total_transactions=int(get("total_transactions")) if get("total_transactions") else None,
295
- total_seats=int(get("total_seats")) if get("total_seats") else None
296
- )
297
-
298
- return FinancialReport(
299
- company_name=company_name,
300
- period_end=date.today(),
301
- period_type=PeriodType.ANNUAL,
302
- currency=Currency.USD,
303
- income_statement=income,
304
- balance_sheet=balance,
305
- cash_flow=cash_flow,
306
- metrics=metrics
307
- )
308
-
309
- @staticmethod
310
- def _build_empty_report() -> FinancialReport:
311
- """Build an empty report as last resort."""
312
- return XLSXParser._build_report({}, "Unknown Company")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/ingestion/unified_parser.py DELETED
@@ -1,84 +0,0 @@
1
- """
2
- Unified Parser - Central coordinator for all file format parsing.
3
-
4
- This module provides a single entry point for parsing any supported
5
- financial document format (CSV, PDF, XLSX).
6
- """
7
-
8
- from typing import Tuple
9
- from app.schemas.financial import FinancialReport
10
-
11
-
12
- class UnifiedParser:
13
- """
14
- Central parser that routes files to appropriate format-specific parsers.
15
-
16
- Supported formats:
17
- - CSV: Comma-separated values
18
- - PDF: PDF documents (10-K, 10-Q, financial reports)
19
- - XLSX/XLS: Excel workbooks
20
- """
21
-
22
- SUPPORTED_EXTENSIONS = {
23
- 'csv': 'csv',
24
- 'pdf': 'pdf',
25
- 'xlsx': 'xlsx',
26
- 'xls': 'xlsx', # Route both to XLSX parser
27
- }
28
-
29
- @staticmethod
30
- def get_format(filename: str) -> str:
31
- """
32
- Determine file format from filename.
33
-
34
- Returns: 'csv', 'pdf', 'xlsx', or raises ValueError
35
- """
36
- ext = filename.lower().rsplit('.', 1)[-1] if '.' in filename else ''
37
-
38
- if ext not in UnifiedParser.SUPPORTED_EXTENSIONS:
39
- raise ValueError(f"Unsupported file format: .{ext}. Supported: .csv, .pdf, .xlsx, .xls")
40
-
41
- return UnifiedParser.SUPPORTED_EXTENSIONS[ext]
42
-
43
- @staticmethod
44
- def parse(file_path: str, filename: str) -> FinancialReport:
45
- """
46
- Parse a financial document and return standardized FinancialReport.
47
-
48
- Args:
49
- file_path: Path to the saved file on disk
50
- filename: Original filename (used for format detection)
51
-
52
- Returns:
53
- FinancialReport with standardized financial data
54
-
55
- Raises:
56
- ValueError: If file format is not supported
57
- """
58
- fmt = UnifiedParser.get_format(filename)
59
-
60
- if fmt == 'csv':
61
- from app.services.ingestion.parser_csv import CSVParser
62
- return CSVParser.parse(file_path)
63
-
64
- elif fmt == 'pdf':
65
- from app.services.ingestion.parser_dolphin import HybridPDFParser
66
- return HybridPDFParser.parse(file_path)
67
-
68
- elif fmt == 'xlsx':
69
- from app.services.ingestion.parser_xlsx import XLSXParser
70
- return XLSXParser.parse(file_path)
71
-
72
- else:
73
- raise ValueError(f"No parser available for format: {fmt}")
74
-
75
- @staticmethod
76
- def is_supported(filename: str) -> bool:
77
- """Check if a filename has a supported extension."""
78
- ext = filename.lower().rsplit('.', 1)[-1] if '.' in filename else ''
79
- return ext in UnifiedParser.SUPPORTED_EXTENSIONS
80
-
81
- @staticmethod
82
- def get_supported_extensions() -> list:
83
- """Return list of supported file extensions."""
84
- return list(UnifiedParser.SUPPORTED_EXTENSIONS.keys())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/intelligence/ai_cfo.py DELETED
@@ -1,52 +0,0 @@
1
- from app.schemas.financial import StandardizedDataPackage
2
- import os
3
-
4
- class AICFOService:
5
- @staticmethod
6
- def generate_executive_summary(data: StandardizedDataPackage) -> str:
7
- """
8
- Generates a natural language executive summary using a generative AI model.
9
- Currently scaffolds the prompt construction and mocks the response if no API key is present.
10
- """
11
-
12
- # 1. Construct Context
13
- company = data.raw_data.company_name
14
- revenue = data.raw_data.income_statement.revenue
15
- margin = data.kpis.net_margin
16
- score = data.risk_analysis.risk_score
17
-
18
- prompt = f"""
19
- You are an elite CFO advising the CEO of {company}.
20
- Financial Snapshot:
21
- - Annual Revenue: ${revenue:,.2f}
22
- - Net Margin: {margin:.1f}%
23
- - Overall Risk Score: {score}/100
24
- - Top Pain Points: {', '.join([p for p in data.insights if 'Pain' in p])}
25
-
26
- Write a 3-paragraph executive summary:
27
- 1. The Good: What is working well?
28
- 2. The Bad: What are the immediate risks?
29
- 3. The Ugly: What needs drastic change immediately?
30
-
31
- Keep it punchy, professional, and actionable.
32
- """
33
-
34
- # 2. Call LLM (Placeholder for Gemini)
35
- # api_key = os.getenv("GEMINI_API_KEY")
36
- # if api_key:
37
- # return call_gemini(api_key, prompt)
38
-
39
- # 3. Mock Response (Fallback)
40
- return (
41
- f"## Executive Summary for {company}\n\n"
42
- "**The Good:**\n"
43
- f"Your revenue is strong at ${revenue:,.0f}, demonstrating clear market demand. "
44
- f"A net margin of {margin:.1f}% is respectable, indicating your core unit economics are sound. "
45
- f"With a Health Score of {data.health_score.total_score}/100, the business foundation is stable.\n\n"
46
- "**The Bad:**\n"
47
- f"We detected some potential liquidity friction locally. Your burn rate suggests you might have constrained runway if sales dip. "
48
- "Optimization of COGS could yield an immediate 2-3% bottom-line improvement.\n\n"
49
- "**The Ugly:**\n"
50
- "No catastrophic risks detected immediately, but reliance on a single revenue stream could be a blind spot. "
51
- "I recommend diversifying customer acquisition channels immediately to safeguard against volatility."
52
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/intelligence/gemini_service.py DELETED
@@ -1,238 +0,0 @@
1
-
2
- import os
3
- import requests
4
- import json
5
- from dotenv import load_dotenv
6
- from app.schemas.chat import ChatRequest, ChatResponse
7
- from app.schemas.financial import StandardizedDataPackage
8
-
9
- # Load .env file
10
- load_dotenv()
11
-
12
- class GeminiService:
13
- API_KEY = os.getenv("GEMINI_API_KEY")
14
-
15
- # Model fallback chain - try in order, fall back if quota exceeded
16
- MODELS = [
17
- "gemini-3-flash", # Primary - fastest, newest
18
- "gemini-2.5-flash", # Fallback 1 - stable
19
- "gemini-2.5-flash-lite", # Fallback 2 - lightweight
20
- "gemini-2.0-flash", # Fallback 3 - legacy stable
21
- ]
22
-
23
- # Track which models have hit quota in this session
24
- _exhausted_models = set()
25
-
26
- @classmethod
27
- def _get_api_url(cls, model_name: str) -> str:
28
- """Generate API URL for a specific model."""
29
- return f"https://generativelanguage.googleapis.com/v1beta/models/{model_name}:generateContent?key={cls.API_KEY}"
30
-
31
- @classmethod
32
- def _reset_exhausted_models(cls):
33
- """Reset exhausted models (call periodically or on new day)."""
34
- cls._exhausted_models.clear()
35
-
36
- @staticmethod
37
- def _parse_error_response(status_code: int, response_text: str) -> str:
38
- """
39
- Parse API error responses and return clean, user-friendly messages.
40
- Never expose raw JSON to users.
41
- """
42
- if status_code == 429:
43
- return "AI service is temporarily busy. Please try again in a few moments."
44
- elif status_code == 401 or status_code == 403:
45
- return "AI service authentication failed. Please check your API key configuration."
46
- elif status_code == 400:
47
- return "Invalid request to AI service. Please try a simpler query."
48
- elif status_code == 500:
49
- return "AI service is experiencing issues. Please try again later."
50
- elif status_code == 503:
51
- return "AI service is temporarily unavailable. Please try again later."
52
- else:
53
- return f"AI service returned an unexpected error (Code: {status_code}). Please try again."
54
-
55
- @classmethod
56
- def _try_request(cls, payload: dict, timeout: int = 30) -> tuple[bool, str, str]:
57
- """
58
- Try to make a request using available models with automatic fallback.
59
- Returns: (success: bool, response_text: str, model_used: str)
60
- """
61
- if not cls.API_KEY:
62
- return False, "Gemini API Key is missing. Please configure GEMINI_API_KEY.", ""
63
-
64
- headers = {"Content-Type": "application/json"}
65
- last_error = ""
66
-
67
- for model in cls.MODELS:
68
- # Skip models that have hit their quota this session
69
- if model in cls._exhausted_models:
70
- continue
71
-
72
- try:
73
- api_url = cls._get_api_url(model)
74
- response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
75
-
76
- if response.status_code == 200:
77
- result = response.json()
78
- try:
79
- text = result['candidates'][0]['content']['parts'][0]['text']
80
- return True, text, model
81
- except (KeyError, IndexError):
82
- last_error = "AI generated empty response."
83
- continue
84
-
85
- elif response.status_code == 429:
86
- # Model quota exceeded - mark as exhausted and try next
87
- cls._exhausted_models.add(model)
88
- print(f"Model {model} quota exceeded, trying next model...")
89
- last_error = "All AI models are currently at capacity."
90
- continue
91
-
92
- else:
93
- # Other error - try next model
94
- last_error = cls._parse_error_response(response.status_code, response.text)
95
- continue
96
-
97
- except requests.exceptions.Timeout:
98
- last_error = "AI service timed out."
99
- continue
100
- except requests.exceptions.ConnectionError:
101
- last_error = "Unable to connect to AI service."
102
- continue
103
- except Exception as e:
104
- last_error = "An unexpected error occurred."
105
- continue
106
-
107
- # All models exhausted
108
- return False, last_error, ""
109
-
110
- @classmethod
111
- def query(cls, request: ChatRequest, context_data: StandardizedDataPackage) -> ChatResponse:
112
- if not cls.API_KEY:
113
- return ChatResponse(response="Gemini API Key is missing. Please configure GEMINI_API_KEY in the backend.")
114
-
115
- # Construct Prompt with Financial Context
116
- system_prompt = f"""
117
- You are Visique, an expert AI CFO. You are analyzing the financial data for {context_data.raw_data.company_name}.
118
-
119
- Financial Context:
120
- - Revenue: {context_data.raw_data.income_statement.revenue} {context_data.raw_data.currency}
121
- - Net Income: {context_data.raw_data.income_statement.net_income}
122
- - Cash Balance: {context_data.raw_data.balance_sheet.cash}
123
- - Health Score: {context_data.health_score.total_score}/100
124
-
125
- Key Insights:
126
- {json.dumps(context_data.insights, indent=2)}
127
-
128
- Optimization Insights (Heatmap/Dead Zones):
129
- {json.dumps([z for z in context_data.optimization_insights.dead_zones] if context_data.optimization_insights else [], indent=2)}
130
-
131
- User Question: {request.message}
132
-
133
- Answer concisely as a CFO. If the user asks about "Dynamic Promos" or "Optimization", refer to the Dead Zones data.
134
- """
135
-
136
- payload = {
137
- "contents": [{
138
- "parts": [{"text": system_prompt}]
139
- }]
140
- }
141
-
142
- success, response_text, model_used = cls._try_request(payload)
143
-
144
- if success:
145
- return ChatResponse(response=response_text)
146
- else:
147
- return ChatResponse(response=response_text)
148
-
149
- @classmethod
150
- def generate_content(cls, prompt: str) -> str:
151
- """
152
- Generic generator for internal services (like GeoService).
153
- Uses automatic model fallback. Returns clean, presentable text.
154
- """
155
- if not cls.API_KEY:
156
- return "Strategic insights require AI configuration. Contact support for assistance."
157
-
158
- payload = {
159
- "contents": [{
160
- "parts": [{"text": prompt}]
161
- }]
162
- }
163
-
164
- success, response_text, model_used = cls._try_request(payload)
165
-
166
- if success:
167
- return response_text
168
- else:
169
- # Return intelligent fallback content instead of error
170
- return cls._get_fallback_content(prompt)
171
-
172
- @staticmethod
173
- def _get_fallback_content(prompt: str) -> str:
174
- """
175
- Provide meaningful fallback content when ALL AI models are unavailable.
176
- This ensures reports and displays never show error messages.
177
- """
178
- prompt_lower = prompt.lower()
179
-
180
- if "competitor" in prompt_lower or "landscape" in prompt_lower:
181
- return """**Market Analysis**
182
-
183
- Based on industry standards for your sector:
184
-
185
- • **Primary Competition**: Focus on businesses within a 5-mile radius offering similar services
186
- • **Traffic Patterns**: Peak hours typically align with lunch (11am-2pm) and evening (5pm-8pm) periods
187
- • **Differentiation**: Evaluate unique value propositions against local alternatives
188
-
189
- *AI-powered real-time analysis available when capacity permits.*"""
190
-
191
- elif "strategic" in prompt_lower or "context" in prompt_lower:
192
- return """**Strategic Context Overview**
193
-
194
- Key considerations for your market:
195
-
196
- • **Regulatory Environment**: Stay current with local business regulations and licensing requirements
197
- • **Economic Indicators**: Monitor regional employment and consumer spending trends
198
- • **Industry Outlook**: Your sector shows stable fundamentals with growth potential
199
-
200
- *Enhanced AI insights will be available shortly.*"""
201
-
202
- elif "marketing" in prompt_lower or "growth" in prompt_lower:
203
- return """**Growth Strategy Framework**
204
-
205
- Recommended focus areas for sustainable growth:
206
-
207
- • **Digital Presence**: Optimize Google Business Profile and local SEO
208
- • **Customer Retention**: Implement loyalty programs to increase lifetime value
209
- • **Community Engagement**: Partner with local organizations for visibility
210
-
211
- *AI-powered personalized recommendations available when capacity permits.*"""
212
-
213
- else:
214
- return """**Analysis Summary**
215
-
216
- Your financial data has been processed successfully. Key takeaways:
217
-
218
- • Review the health score breakdown for areas of strength and improvement
219
- • Monitor cash runway projections for operational planning
220
- • Consider the recommendations provided for optimization opportunities
221
-
222
- *For deeper AI-driven insights, please try again in a few minutes.*"""
223
-
224
- @classmethod
225
- def get_model_status(cls) -> dict:
226
- """
227
- Get current status of available models (for debugging/admin).
228
- """
229
- available_models = [m for m in cls.MODELS if m not in cls._exhausted_models]
230
- exhausted = list(cls._exhausted_models)
231
-
232
- return {
233
- "total_models": len(cls.MODELS),
234
- "available_models": available_models,
235
- "exhausted_models": exhausted,
236
- "all_exhausted": len(available_models) == 0
237
- }
238
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/intelligence/geo_service.py DELETED
@@ -1,104 +0,0 @@
1
-
2
- import random
3
-
4
- class GeoService:
5
- @staticmethod
6
- def analyze_location(address: str, industry: str = "General", is_own_company: bool = False, company_name: str = ""):
7
- """
8
- Generates strategic analysis using Google Gemini if available,
9
- otherwise falls back to simulation.
10
-
11
- :param address: The address to analyze
12
- :param industry: The industry type
13
- :param is_own_company: Whether this is the user's own company (enables more personalized insights)
14
- :param company_name: Name of the company being analyzed
15
- """
16
- from app.services.intelligence.gemini_service import GeminiService
17
-
18
- context_prefix = f"for {company_name}" if company_name else ""
19
- personalization = "your business" if is_own_company else f"this {industry} business"
20
-
21
- # Check for Real AI Capability
22
- if GeminiService.API_KEY:
23
- try:
24
- # 1. Competitor Landscape
25
- p1 = f"Analyze the competitor landscape {context_prefix} for a {industry} business located at {address}. {'As the owner, provide actionable competitive intelligence.' if is_own_company else 'Provide general market context.'} Identify 3 competitors and describe the traffic patterns in the area. Limit to 150 words. Format with **Bold** headers."
26
- comp_summary = GeminiService.generate_content(p1)
27
-
28
- # 2. Strategic Context
29
- p2 = f"Provide a brief strategic context analysis for {address} regarding local regulations, news events, and economic sentiment for the {industry} sector {context_prefix}. {'Include specific recommendations for the owner.' if is_own_company else ''} Limit to 150 words."
30
- context_summary = GeminiService.generate_content(p2)
31
-
32
- # 3. Marketing Strategy
33
- p3 = f"Suggest a growth and marketing strategy for {personalization} at {address}. {'Be specific with actionable next steps for the owner to implement.' if is_own_company else 'Provide general market positioning advice.'} Include digital positioning advice and 2 actionable recommendations. Limit to 150 words."
34
- marketing_summary = GeminiService.generate_content(p3)
35
-
36
- return {
37
- "competitor_analysis": comp_summary,
38
- "strategic_context": context_summary,
39
- "marketing_strategy": marketing_summary
40
- }
41
- except Exception as e:
42
- print(f"Gemini Generation Failed: {e}. Falling back to simulation.")
43
- # Fallthrough to default logic below
44
-
45
- # ... FALLBACK MOCK DATA ...
46
- # Mocking external data capabilities
47
- competitors = [
48
- "Alpha Competitor Inc.", "Beta Rivals LLC", "Local Market Leader"
49
- ] if industry != "Restaurant" else [
50
- "The Hungry Chef", "Burger King", "Downtown Bistro"
51
- ]
52
-
53
- ownership_note = "As the owner of this business," if is_own_company else "For this business,"
54
- company_ref = company_name if company_name else "the business"
55
-
56
- # 1. Competitor & Location Analysis (Page 1 content)
57
- comp_summary = f"""
58
- **Location Analysis for:** {address}
59
- **Company:** {company_ref}
60
- **Industry Focus:** {industry}
61
-
62
- **Competitor Landscape:**
63
- {ownership_note} we have identified {len(competitors)} primary competitors within a 5-mile radius:
64
- {', '.join(competitors)}.
65
-
66
- **Traffic Patterns:**
67
- Based on historical data, the highest foot traffic in your area occurs between 11:00 AM and 2:00 PM on weekdays.
68
-
69
- **Site Accessibility:**
70
- Your location has a Walk Score of {random.randint(40, 95)}/100 and Transit Score of {random.randint(30, 80)}/100.
71
- """
72
-
73
- # 2. Political & Local News Context (Page 2 content)
74
- context_summary = f"""
75
- **Strategic Context: Local & Political Landscape**
76
-
77
- **Regulatory Updates:**
78
- Recent city council proceedings indicate a favorable shift for {industry} businesses.
79
-
80
- **Economic Sentiment:**
81
- Local consumer sentiment is currently 'Optimistic' with a spending index of {random.randint(90, 110)}.
82
-
83
- {"**Owner Action Item:** Engage with local business association for networking opportunities." if is_own_company else ""}
84
- """
85
-
86
- # 3. Marketing & Growth Opportunities (Page 3 content)
87
- marketing_summary = f"""
88
- **Growth & Marketing Strategy for {company_ref}**
89
-
90
- **Key Marketing Events:**
91
- Leverage upcoming local opportunities like the Annual City Festival.
92
-
93
- **Actionable Recommendations:**
94
- 1. **Hyper-Local SEO:** {"Optimize your" if is_own_company else "Optimize the"} Google Business Profile for '{company_ref}'.
95
- 2. **Community Partnerships:** Engage with local news events and neighborhood associations.
96
- {"3. **Owner Priority:** Focus on building customer reviews - aim for 50+ 5-star reviews." if is_own_company else ""}
97
- """
98
-
99
- return {
100
- "competitor_analysis": comp_summary,
101
- "strategic_context": context_summary,
102
- "marketing_strategy": marketing_summary
103
- }
104
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/intelligence/rag.py DELETED
@@ -1,35 +0,0 @@
1
- from app.schemas.chat import ChatRequest, ChatResponse, Message
2
- from app.schemas.financial import StandardizedDataPackage
3
-
4
- class RAGService:
5
- @staticmethod
6
- def query(request: ChatRequest, data_context: StandardizedDataPackage) -> ChatResponse:
7
- """
8
- Scaffolds the RAG logic.
9
- In a real implementation, this would:
10
- 1. Chunk the 'data_context' into vectors (Income, Balance, Risk).
11
- 2. Embed the 'request.messages[-1].content'.
12
- 3. Retrieve relevant chunks.
13
- 4. Synthesize an answer via LLM.
14
- """
15
-
16
- last_message = request.messages[-1].content.lower()
17
-
18
- # Simple Keyword Matching (Mock RAG)
19
- extracted_info = []
20
- if "revenue" in last_message:
21
- extracted_info.append(f"Revenue: ${data_context.raw_data.income_statement.revenue:,.2f}")
22
- if "net income" in last_message or "profit" in last_message:
23
- extracted_info.append(f"Net Income: ${data_context.raw_data.income_statement.net_income:,.2f}")
24
- if "margin" in last_message:
25
- extracted_info.append(f"Net Margin: {data_context.kpis.net_margin}%")
26
-
27
- if not extracted_info:
28
- response_text = "I am a financial AI. Ask me about Revenue, Margins, or Risk."
29
- else:
30
- response_text = "Based on the latest financial data:\n- " + "\n- ".join(extracted_info)
31
-
32
- return ChatResponse(
33
- response=response_text,
34
- sources=["Financial Report Q4", "KPI Analysis Module"]
35
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/reporting/pdf_report.py DELETED
@@ -1,565 +0,0 @@
1
-
2
-
3
- from reportlab.lib.pagesizes import letter
4
- from reportlab.lib import colors
5
- from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
6
- from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, Frame, PageTemplate, Image
7
- from reportlab.lib.units import inch
8
- from reportlab.pdfgen import canvas
9
- from reportlab.graphics.shapes import Drawing
10
- from reportlab.graphics.charts.barcharts import VerticalBarChart
11
- from reportlab.graphics.charts.linecharts import HorizontalLineChart
12
- from reportlab.graphics.charts.piecharts import Pie
13
- from reportlab.lib.colors import HexColor
14
- from app.schemas.financial import StandardizedDataPackage
15
- import os
16
- import re
17
- from datetime import datetime
18
- from pypdf import PdfReader, PdfWriter, PageObject
19
- import io
20
-
21
- class PDFReporter:
22
-
23
- TEMPLATE_PATH = "app/assets/report_template.pdf"
24
-
25
- @staticmethod
26
- def _sanitize_content(text: str) -> str:
27
- """Clean AI-generated content."""
28
- if not text:
29
- return ""
30
-
31
- # Remove JSON blocks and API error responses
32
- text = re.sub(r'\{[^}]*"@type"[^}]*\}', '', text)
33
- text = re.sub(r'\{[^}]*"quotaMetric"[^}]*\}', '', text)
34
- text = re.sub(r'\[\s*\{.*?\}\s*\]', '', text, flags=re.DOTALL)
35
- text = re.sub(r'"[a-zA-Z_]+"\s*:\s*"[^"]*"', '', text)
36
- text = re.sub(r'AI Error:\s*\d+.*', '', text)
37
- text = re.sub(r'System Error:.*', '', text)
38
- text = re.sub(r'https?://[^\s]+', '', text)
39
-
40
- # Clean up markdown formatting
41
- text = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text) # Bold
42
- text = text.replace("##", "").replace("###", "").replace("#", "")
43
- text = re.sub(r'(?<!\S)\*\s+', '• ', text) # Bullet points
44
-
45
- text = re.sub(r'\n{3,}', '\n\n', text)
46
- text = re.sub(r'[ \t]+', ' ', text)
47
- text = text.strip()
48
- text = re.sub(r'[\[\]{}]', '', text)
49
-
50
- return text if text else "Analysis data will be available upon API configuration."
51
-
52
- @staticmethod
53
- def _create_pie_chart(data_dict, title="Breakdown", width=400, height=200):
54
- """Create a Pie Chart."""
55
- drawing = Drawing(width, height)
56
- pc = Pie()
57
- # Scaling - constrain by the smaller dimension with margin
58
- size = min(width, height) - 20
59
- if size < 50: size = 50 # Minimum size
60
-
61
- pc.width = size
62
- pc.height = size
63
- pc.x = (width - size) / 2
64
- pc.y = (height - size) / 2
65
-
66
- # Filter zero values
67
- labels = []
68
- data = []
69
- for k, v in data_dict.items():
70
- if v and v > 0:
71
- labels.append(k)
72
- data.append(v)
73
-
74
- if not data:
75
- return drawing
76
-
77
- pc.data = data
78
- pc.labels = labels
79
- pc.slices.strokeWidth = 0.5
80
-
81
- # Visique Colors
82
- colors_list = [
83
- HexColor("#0891b2"), HexColor("#0f172a"), HexColor("#38bdf8"),
84
- HexColor("#94a3b8"), HexColor("#cffafe")
85
- ]
86
- for i in range(len(data)):
87
- pc.slices[i].fillColor = colors_list[i % len(colors_list)]
88
-
89
- drawing.add(pc)
90
- return drawing
91
-
92
- @staticmethod
93
- def _create_chart_with_description(data: StandardizedDataPackage, type='revenue', width=400, height=200):
94
- """Creates charts for the report with centering and descriptions. Width/Height control drawing size."""
95
- drawing = Drawing(width, height)
96
- desc_text = ""
97
-
98
- if type == 'revenue':
99
- rev = data.raw_data.income_statement.revenue or 0
100
- exp = data.raw_data.income_statement.operating_expenses or 0
101
- net = data.raw_data.income_statement.net_income or 0
102
-
103
- # Description Logic
104
- margin = (net / rev * 100) if rev else 0
105
- if rev >= 1e9:
106
- rev_str = f"${rev/1e9:.1f}B"
107
- else:
108
- rev_str = f"${rev/1e6:.1f}M"
109
-
110
- desc_text = f"<b>Performance Overview:</b> The company generated <b>{rev_str}</b> in revenue. <br/>Net profit margin stands at <b>{margin:.1f}%</b> after expenses."
111
-
112
- data_vals = [(rev, exp, net)]
113
- bc = VerticalBarChart()
114
- bc.x = width * 0.15 # dynamic margin
115
- bc.y = 50
116
- bc.height = height * 0.6
117
- bc.width = width * 0.75
118
- bc.data = data_vals
119
- bc.strokeColor = colors.white
120
- max_val = max(rev, exp, net, 100)
121
- bc.valueAxis.valueMin = 0
122
- bc.valueAxis.valueMax = max_val * 1.1
123
- bc.valueAxis.valueStep = max_val / 4
124
- bc.valueAxis.labelTextFormat = lambda x: f'{x/1e9:.1f}B' if x >= 1e9 else f'{x/1e6:.0f}M'
125
-
126
- bc.categoryAxis.categoryNames = ['Revenue', 'Op. Expenses', 'Net Income']
127
- bc.bars[0].fillColor = colors.HexColor("#0891b2")
128
- drawing.add(bc)
129
-
130
- elif type == 'runway':
131
- if data.runway_forecast:
132
- burn = abs(data.runway_forecast.burn_rate_monthly or 0)
133
- months = min(data.runway_forecast.months_left or 0, 24)
134
- cash = data.raw_data.balance_sheet.cash or 0
135
-
136
- desc_text = f"<b>Cash Runway:</b> Based on a monthly burn of <b>${burn:,.0f}</b>,<br/>cash reserves will support operations for <b>{months:.1f} months</b>."
137
-
138
- if burn > 0:
139
- projection = [max(0, cash - (burn * i)) for i in range(int(months) + 2)]
140
- lc = HorizontalLineChart()
141
- lc.x = width * 0.15
142
- lc.y = 50
143
- lc.height = height * 0.6
144
- lc.width = width * 0.75
145
- lc.data = [projection]
146
- lc.joinedLines = 1
147
- lc.categoryAxis.categoryNames = [f"M{i}" for i in range(len(projection))]
148
- lc.valueAxis.valueMin = 0
149
- lc.lines[0].strokeColor = colors.HexColor("#06b6d4")
150
- lc.lines[0].strokeWidth = 2
151
- drawing.add(lc)
152
- else:
153
- desc_text = "Runway data unavailable."
154
-
155
- elif type == 'expenses_pie':
156
- expenses = {
157
- "COGS": data.raw_data.income_statement.cogs,
158
- "Payroll": data.raw_data.income_statement.payroll_expenses,
159
- "Marketing": data.raw_data.income_statement.marketing_expenses,
160
- "Rent": data.raw_data.income_statement.rent_expense,
161
- "Other": data.raw_data.income_statement.other_operating_expenses
162
- }
163
- drawing = PDFReporter._create_pie_chart(expenses, width=width, height=height)
164
- desc_text = "<b>Expense Profile:</b> Breakdown of major cost centers.<br/>Monitor COGS and Payroll trends."
165
-
166
- elif type == 'assets_pie':
167
- assets = {
168
- "Cash": data.raw_data.balance_sheet.cash,
169
- "Receivables": data.raw_data.balance_sheet.accounts_receivable,
170
- "Inventory": data.raw_data.balance_sheet.inventory,
171
- "Property/Eq": data.raw_data.balance_sheet.property_plant_equipment,
172
- "Intangibles": data.raw_data.balance_sheet.intangible_assets,
173
- }
174
- drawing = PDFReporter._create_pie_chart(assets, width=width, height=height)
175
- desc_text = "<b>Asset Mix:</b> Composition of short vs long term assets.<br/>Liquidity is key for stability."
176
-
177
- # Wrapper Table for Centering & Description
178
- styles = getSampleStyleSheet()
179
- desc_style = ParagraphStyle('ChartDesc', parent=styles['Normal'], fontSize=9, leading=11, alignment=1, textColor=colors.HexColor("#64748b"))
180
-
181
- t = Table([[drawing], [Paragraph(desc_text, desc_style)]], colWidths=[width], rowHeights=[height+10, 40])
182
- t.setStyle(TableStyle([
183
- ('ALIGN', (0,0), (-1,-1), 'CENTER'),
184
- ('VALIGN', (0,0), (-1,-1), 'TOP'),
185
- ]))
186
- return t
187
-
188
- @staticmethod
189
- def cover_template_header(canvas, doc):
190
- """Draws a white box to mask the template's placeholder text."""
191
- canvas.saveState()
192
- canvas.setFillColor(colors.white)
193
- # Position: Top of page, below logo, covering center mess
194
- # Page height is 11 inch = 792 pt
195
- # Logo is usually at top 1 inch (y=720+).
196
- # Placeholders "Company Name" etc usually around y=700.
197
- canvas.rect(0, 680, 612, 60, fill=1, stroke=0)
198
- canvas.restoreState()
199
-
200
- @staticmethod
201
- def _create_chart(data: StandardizedDataPackage, type='revenue'):
202
- """Creates charts for the report."""
203
- drawing = Drawing(400, 200)
204
-
205
- if type == 'revenue':
206
- # Revenue vs Expenses vs Net Income
207
- rev = data.raw_data.income_statement.revenue or 0
208
- exp = data.raw_data.income_statement.operating_expenses or 0
209
- net = data.raw_data.income_statement.net_income or 0
210
-
211
- data_vals = [(rev, exp, net)]
212
-
213
- bc = VerticalBarChart()
214
- bc.x = 50
215
- bc.y = 50
216
- bc.height = 125
217
- bc.width = 300
218
- bc.data = data_vals
219
- bc.strokeColor = colors.white
220
-
221
- # Dynamic axis scaling
222
- max_val = max(rev, exp, net, 100)
223
- bc.valueAxis.valueMin = 0
224
- bc.valueAxis.valueMax = max_val * 1.1
225
- bc.valueAxis.valueStep = max_val / 5
226
-
227
- bc.categoryAxis.labels.boxAnchor = 'ne'
228
- bc.categoryAxis.labels.dx = 8
229
- bc.categoryAxis.labels.dy = -2
230
- bc.categoryAxis.categoryNames = ['Revenue', 'Op. Expenses', 'Net Income']
231
- bc.bars[0].fillColor = colors.HexColor("#0891b2")
232
- drawing.add(bc)
233
-
234
- elif type == 'runway':
235
- # Simple burn rate projection
236
- burn = 0
237
- months = 0
238
-
239
- if data.runway_forecast:
240
- burn = abs(data.runway_forecast.burn_rate_monthly or 0)
241
- months = min(data.runway_forecast.months_left or 0, 24)
242
-
243
- cash = data.raw_data.balance_sheet.cash or 0
244
-
245
- # Projected cash balance line
246
- if burn > 0:
247
- projection = [max(0, cash - (burn * i)) for i in range(int(months) + 2)]
248
-
249
- lc = HorizontalLineChart()
250
- lc.x = 50
251
- lc.y = 50
252
- lc.height = 125
253
- lc.width = 300
254
- lc.data = [projection]
255
- lc.joinedLines = 1
256
- lc.categoryAxis.categoryNames = [f"M{i}" for i in range(len(projection))]
257
- lc.valueAxis.valueMin = 0
258
- lc.lines[0].strokeColor = colors.HexColor("#06b6d4")
259
- lc.lines[0].strokeWidth = 2
260
- drawing.add(lc)
261
-
262
- return drawing
263
-
264
- @staticmethod
265
- def generate(data: StandardizedDataPackage, filename: str):
266
- # 1. Generate content PDF using ReportLab
267
- packet = io.BytesIO()
268
- doc = SimpleDocTemplate(
269
- packet,
270
- pagesize=letter,
271
- rightMargin=inch,
272
- leftMargin=inch,
273
- topMargin=1.5*inch, # More space for header
274
- bottomMargin=1*inch
275
- )
276
-
277
- # Styles
278
- styles = getSampleStyleSheet()
279
-
280
- title_style = ParagraphStyle('VisiqueTitle', parent=styles['Heading1'], fontSize=26, textColor=colors.HexColor("#0f172a"), spaceAfter=25, fontName='Helvetica-Bold')
281
- section_style = ParagraphStyle('VisiqueSection', parent=styles['Heading1'], fontSize=30, textColor=colors.HexColor("#0f172a"), spaceBefore=100, spaceAfter=20, alignment=1, fontName='Helvetica-Bold')
282
- header_style = ParagraphStyle('VisiqueHeader', parent=styles['Heading2'], fontSize=16, textColor=colors.HexColor("#334155"), spaceBefore=20, spaceAfter=10, keepWithNext=True, fontName='Helvetica-Bold')
283
- body_style = ParagraphStyle('VisiqueBody', parent=styles['Normal'], fontSize=11, leading=15, spaceAfter=10, textColor=colors.HexColor("#334155"), fontName='Helvetica')
284
- score_style = ParagraphStyle('ScoreStyle', parent=styles['Normal'], fontSize=32, leading=36, textColor=colors.HexColor("#0ea5e9"), alignment=1, fontName='Helvetica-Bold')
285
- tiny_meta = ParagraphStyle('TinyMeta', parent=styles['Normal'], fontSize=8, textColor=colors.gray)
286
-
287
- elements = []
288
-
289
- # === PAGE 1: EXECUTIVE SUMMARY ===
290
- elements.append(Paragraph(f"Financial Intelligence Report", title_style))
291
- elements.append(Paragraph(f"<b>Target Entity:</b> {data.raw_data.company_name}", body_style))
292
- elements.append(Paragraph(f"<b>Reporting Period:</b> {data.raw_data.period_end}", body_style))
293
- elements.append(Spacer(1, 20))
294
-
295
- # Health Score Box
296
- elements.append(Paragraph("Strategic Health Score", header_style))
297
- score_data = [[
298
- Paragraph(f"<b>{data.health_score.total_score:.0f}</b> / 100", score_style),
299
- [
300
- Paragraph(f"• Stability: {data.health_score.stability:.0f}/25", body_style),
301
- Paragraph(f"• Profitability: {data.health_score.profitability:.0f}/35", body_style),
302
- Paragraph(f"• Growth: {data.health_score.growth:.0f}/10", body_style),
303
- Paragraph(f"• Efficiency: {data.health_score.efficiency:.0f}/20", body_style),
304
- ]
305
- ]]
306
- score_table = Table(score_data, colWidths=[2*inch, 4*inch])
307
- score_table.setStyle(TableStyle([
308
- ('ALIGN', (0,0), (0,0), 'CENTER'),
309
- ('VALIGN', (0,0), (-1,-1), 'MIDDLE'),
310
- ('BOX', (0,0), (0,0), 1, colors.HexColor("#0891b2")),
311
- ('ROUNDEDCORNERS', [10, 10, 10, 10]),
312
- ('TOPPADDING', (0,0), (-1,-1), 15),
313
- ('BOTTOMPADDING', (0,0), (-1,-1), 15),
314
- ]))
315
- elements.append(score_table)
316
- elements.append(Spacer(1, 15))
317
-
318
- # Executive Insights (Top 3)
319
- # Replacing simple list with Risk/Win Table
320
-
321
- wins = []
322
- risks = []
323
-
324
- # Parse insights for win/risk
325
- if data.insights:
326
- for insight in data.insights:
327
- if any(x in insight for x in ["Risk", "High", "Decrease", "Burn", "Negative"]):
328
- if len(risks) < 3: risks.append(insight)
329
- else:
330
- if len(wins) < 3: wins.append(insight)
331
-
332
- # Ensure at least some data
333
- if not wins: wins = ["Stable operations detected."]
334
- if not risks: risks = ["No critical risks detected."]
335
-
336
- rw_data = [
337
- [Paragraph("<b>Key Wins</b>", body_style), Paragraph("<b>Risk Factors</b>", body_style)],
338
- [[Paragraph(f"• {PDFReporter._sanitize_content(w)}", body_style) for w in wins],
339
- [Paragraph(f"• {PDFReporter._sanitize_content(r)}", body_style) for r in risks]]
340
- ]
341
-
342
- rw_table = Table(rw_data, colWidths=[3*inch, 3*inch])
343
- rw_table.setStyle(TableStyle([
344
- ('BACKGROUND', (0,0), (0,0), colors.HexColor("#dcfce7")), # Light Green
345
- ('BACKGROUND', (1,0), (1,0), colors.HexColor("#fee2e2")), # Light Red
346
- ('VALIGN', (0,0), (-1,-1), 'TOP'),
347
- ('GRID', (0,0), (-1,-1), 0.5, colors.grey),
348
- ('TOPPADDING', (0,0), (-1,-1), 6),
349
- ('BOTTOMPADDING', (0,0), (-1,-1), 6),
350
- ]))
351
- elements.append(rw_table)
352
-
353
- elements.append(PageBreak())
354
-
355
- # === PAGE 2: INCOME STATEMENT ===
356
- elements.append(Paragraph("Income Statement Analysis", title_style))
357
-
358
- # Charts Row - Now using description wrapper which is a Table itself
359
- # To side-by-side, we need a wrapper table
360
- # Page width 8.5in. Margins 1in. Content = 6.5in.
361
- # Split 2 cols = 3.25in each = ~234 points.
362
- col_w = 3.2 * inch
363
-
364
- c1 = PDFReporter._create_chart_with_description(data, 'revenue', width=220, height=180)
365
- c2 = PDFReporter._create_chart_with_description(data, 'expenses_pie', width=220, height=180)
366
-
367
- chart_container = Table([[c1, c2]], colWidths=[col_w, col_w])
368
- chart_container.setStyle(TableStyle([
369
- ('ALIGN', (0,0), (-1,-1), 'CENTER'),
370
- ('VALIGN', (0,0), (-1,-1), 'TOP'),
371
- ]))
372
- elements.append(chart_container)
373
-
374
- elements.append(Spacer(1, 15))
375
-
376
- income_data = [
377
- ["Metric", "Value"],
378
- ["Revenue", f"${data.raw_data.income_statement.revenue:,.2f}"],
379
- ["COGS", f"${data.raw_data.income_statement.cogs:,.2f}"],
380
- ["Gross Profit", f"${data.raw_data.income_statement.gross_profit:,.2f}"],
381
- ["Op. Expenses", f"${data.raw_data.income_statement.operating_expenses:,.2f}"],
382
- ["Net Income", f"${data.raw_data.income_statement.net_income:,.2f}"],
383
- ]
384
- t_income = Table(income_data, colWidths=[3.5*inch, 2.5*inch])
385
- t_income.setStyle(TableStyle([
386
- ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor("#0891b2")),
387
- ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
388
- ('ALIGN', (1, 0), (-1, -1), 'RIGHT'),
389
- ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
390
- ('GRID', (0, 0), (-1, -1), 1, colors.HexColor("#e2e8f0")),
391
- ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.HexColor("#f8fafc")]),
392
- ('TOPPADDING', (0,0), (-1,-1), 8),
393
- ('BOTTOMPADDING', (0,0), (-1,-1), 8),
394
- ]))
395
- elements.append(t_income)
396
- elements.append(PageBreak())
397
-
398
- # === PAGE 3: BALANCE SHEET ===
399
- elements.append(Paragraph("Balance Sheet & Ratios", title_style))
400
-
401
- # Add Asset Chart (Centered)
402
- c_assets = PDFReporter._create_chart_with_description(data, 'assets_pie')
403
- # Center horizontally
404
- t_asset_wrapper = Table([[c_assets]], colWidths=[7*inch])
405
- t_asset_wrapper.setStyle(TableStyle([('ALIGN', (0,0), (-1,-1), 'CENTER')]))
406
- elements.append(t_asset_wrapper)
407
-
408
- elements.append(Spacer(1, 10))
409
-
410
- kpi_data = [["Key Ratio", "Value", "Benchmark"]]
411
- if data.kpis:
412
- kpi_data.append(["Current Ratio", f"{data.kpis.current_ratio:.2f}x", "> 1.5x"])
413
- kpi_data.append(["Debt-to-Equity", f"{data.kpis.debt_to_equity:.2f}x", "< 2.0x"])
414
- kpi_data.append(["Return on Equity", f"{data.kpis.roe:.1%}", "15-20%"])
415
- kpi_data.append(["DSO", f"{data.kpis.dso:.0f} days", "< 45 days"])
416
-
417
- t_kpi = Table(kpi_data, colWidths=[2.5*inch, 1.5*inch, 1.5*inch])
418
- t_kpi.setStyle(TableStyle([
419
- ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor("#0f172a")),
420
- ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
421
- ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
422
- ('GRID', (0, 0), (-1, -1), 1, colors.HexColor("#e2e8f0")),
423
- ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.HexColor("#f1f5f9")]),
424
- ('TOPPADDING', (0,0), (-1,-1), 8),
425
- ('BOTTOMPADDING', (0,0), (-1,-1), 8),
426
- ]))
427
- elements.append(t_kpi)
428
- elements.append(Spacer(1, 20))
429
-
430
- bs_data = [
431
- ["Balance Sheet Item", "Value"],
432
- ["Total Assets", f"${data.raw_data.balance_sheet.total_assets:,.2f}"],
433
- [" Cash & Equiv.", f"${data.raw_data.balance_sheet.cash:,.2f}"],
434
- ["Total Liabilities", f"${data.raw_data.balance_sheet.total_liabilities:,.2f}"],
435
- [" Short Term Debt", f"${data.raw_data.balance_sheet.short_term_debt:,.2f}"],
436
- [" Long Term Debt", f"${data.raw_data.balance_sheet.long_term_debt:,.2f}"],
437
- ["Total Equity", f"${data.raw_data.balance_sheet.total_equity:,.2f}"],
438
- ]
439
- t_bs = Table(bs_data, colWidths=[3.5*inch, 2.5*inch])
440
- t_bs.setStyle(TableStyle([
441
- ('LINEBELOW', (0,0), (-1,0), 1, colors.black),
442
- ('ALIGN', (1, 0), (-1, -1), 'RIGHT'),
443
- ('TOPPADDING', (0,0), (-1,-1), 6),
444
- ('BOTTOMPADDING', (0,0), (-1,-1), 6),
445
- ]))
446
- elements.append(t_bs)
447
- elements.append(PageBreak())
448
-
449
- # === PAGE 4: CASH FLOW & RUNWAY ===
450
- elements.append(Paragraph("Cash Flow & Runway", title_style))
451
-
452
- if data.runway_forecast and data.runway_forecast.burn_rate_monthly > 0:
453
- c_runway = PDFReporter._create_chart_with_description(data, 'runway')
454
- t_runway_wrapper = Table([[c_runway]], colWidths=[7*inch])
455
- t_runway_wrapper.setStyle(TableStyle([('ALIGN', (0,0), (-1,-1), 'CENTER')]))
456
- elements.append(t_runway_wrapper)
457
- else:
458
- elements.append(Paragraph("Positive Cash Flow Generation", header_style))
459
- elements.append(Paragraph("This entity is cash flow positive and does not have a finite runway.", body_style))
460
-
461
- elements.append(Spacer(1, 20))
462
- cf_data = [
463
- ["Cash Flow Metric", "Value"],
464
- ["Operating Cash Flow", f"${data.raw_data.cash_flow.operating_cash_flow:,.2f}"],
465
- ["Investing Cash Flow", f"${data.raw_data.cash_flow.investing_cash_flow:,.2f}"],
466
- ["Financing Cash Flow", f"${data.raw_data.cash_flow.financing_cash_flow:,.2f}"],
467
- ["Net Change in Cash", f"${data.raw_data.cash_flow.net_change_in_cash:,.2f}"],
468
- ]
469
- t_cf = Table(cf_data, colWidths=[3.5*inch, 2.5*inch])
470
- t_cf.setStyle(TableStyle([
471
- ('ALIGN', (1, 0), (-1, -1), 'RIGHT'),
472
- ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
473
- ('TOPPADDING', (0,0), (-1,-1), 6),
474
- ('BOTTOMPADDING', (0,0), (-1,-1), 6),
475
- ]))
476
- elements.append(t_cf)
477
- elements.append(PageBreak())
478
-
479
- # === PAGE 5: STRATEGIC INTELLIGENCE ===
480
- elements.append(Paragraph("Strategic Intelligence", title_style))
481
- if data.geo_analysis:
482
- elements.append(Paragraph("Market Context", header_style))
483
- elements.append(Paragraph(PDFReporter._sanitize_content(data.geo_analysis.strategic_context), body_style))
484
-
485
- elements.append(Paragraph("Competitors", header_style))
486
- elements.append(Paragraph(PDFReporter._sanitize_content(data.geo_analysis.competitor_analysis), body_style))
487
-
488
- elements.append(Paragraph("Growth Strategy", header_style))
489
- elements.append(Paragraph(PDFReporter._sanitize_content(data.geo_analysis.marketing_strategy), body_style))
490
- else:
491
- elements.append(Paragraph("Strategic data unavailable.", body_style))
492
- elements.append(PageBreak())
493
-
494
- # === PAGE 6: AI CFO RECOMMENDATIONS ===
495
- elements.append(Paragraph("Predictive Outlook & Recommendations", title_style))
496
-
497
- if data.insights:
498
- # Skip first 3 used in exec summary
499
- for i, insight in enumerate(data.insights[3:], 1):
500
- elements.append(Paragraph(f"Recommendation #{i}", header_style))
501
- elements.append(Paragraph(PDFReporter._sanitize_content(insight), body_style))
502
-
503
- elements.append(PageBreak())
504
-
505
- # === PAGE 7: APPENDIX ===
506
- elements.append(Paragraph("Appendix: Full Data", title_style))
507
- elements.append(Paragraph("Raw data extraction log.", body_style))
508
- elements.append(Paragraph(f"Generated by Visique Engine v2.1 on {datetime.now()}", tiny_meta))
509
-
510
- # Use onFirstPage and onLaterPages to draw the white box
511
- doc.build(elements, onFirstPage=PDFReporter.cover_template_header, onLaterPages=PDFReporter.cover_template_header)
512
- packet.seek(0)
513
-
514
- # 2. Overlay onto Template
515
- try:
516
- # Load Template
517
- template_path = os.path.join(os.getcwd(), PDFReporter.TEMPLATE_PATH)
518
- if not os.path.exists(template_path):
519
- # Fallback if template missing - just save the raw pdf
520
- with open(filename, "wb") as f:
521
- f.write(packet.getbuffer())
522
- return filename
523
-
524
- template_pdf = PdfReader(template_path)
525
- content_pdf = PdfReader(packet)
526
- output_pdf = PdfWriter()
527
-
528
- # For each page of content, adding it to the template page
529
- # Note: If content has more pages than template, we reuse template page 0 (or last)
530
- template_page = template_pdf.pages[0]
531
-
532
- for page_num in range(len(content_pdf.pages)):
533
- # CORRECT APPROACH:
534
- # 1. Create a blank page of correct size
535
- # 2. Merge template (background)
536
- # 3. Merge content (foreground)
537
-
538
- # Get dimensions from template
539
- width = template_page.mediabox.width
540
- height = template_page.mediabox.height
541
-
542
- # Create base page
543
- output_page = PageObject.create_blank_page(width=width, height=height)
544
-
545
- # Merge template onto it
546
- output_page.merge_page(template_page)
547
-
548
- # Merge generated content onto it
549
- output_page.merge_page(content_pdf.pages[page_num])
550
-
551
- # Add to output
552
- output_pdf.add_page(output_page)
553
-
554
- with open(filename, "wb") as f:
555
- output_pdf.write(f)
556
-
557
- except Exception as e:
558
- print(f"Error merging template: {e}")
559
- # Fallback to saving raw content
560
- with open(filename, "wb") as f:
561
- f.write(packet.getbuffer())
562
-
563
- return filename
564
-
565
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/reporting/pptx_report.py DELETED
@@ -1,57 +0,0 @@
1
- from pptx import Presentation
2
- from pptx.util import Inches, Pt
3
- from pptx.dml.color import RGBColor
4
- from app.schemas.financial import StandardizedDataPackage
5
-
6
- class PPTXReporter:
7
- @staticmethod
8
- def generate(data: StandardizedDataPackage, filename: str):
9
- prs = Presentation()
10
-
11
- # Slide 1: Title Slide
12
- slide_layout = prs.slide_layouts[0] # Title Slide
13
- slide = prs.slides.add_slide(slide_layout)
14
- title = slide.shapes.title
15
- subtitle = slide.placeholders[1]
16
-
17
- title.text = f"Financial Analysis: {data.raw_data.company_name}"
18
- subtitle.text = f"Risk Score: {data.risk_analysis.risk_score} | Visique AI"
19
-
20
- # Slide 2: Key Metrics
21
- slide_layout = prs.slide_layouts[1] # Title and Content
22
- slide = prs.slides.add_slide(slide_layout)
23
- title = slide.shapes.title
24
- title.text = "Key Financial Metrics"
25
-
26
- content = slide.placeholders[1]
27
- text_frame = content.text_frame
28
-
29
- p = text_frame.add_paragraph()
30
- p.text = f"Revenue: ${data.raw_data.income_statement.revenue:,}"
31
- p.level = 0
32
-
33
- p = text_frame.add_paragraph()
34
- p.text = f"Net Margin: {data.kpis.net_margin}%" if data.kpis.net_margin else "Net Margin: N/A"
35
- p.level = 0
36
-
37
- p = text_frame.add_paragraph()
38
- p.text = f"Solvency Risk: {data.risk_analysis.solvency_risk}"
39
- p.level = 0
40
-
41
- # Slide 3: Insights & Pain Points
42
- slide = prs.slides.add_slide(slide_layout)
43
- title = slide.shapes.title
44
- title.text = "AI Insights & Pain Points"
45
-
46
- content = slide.placeholders[1]
47
- text_frame = content.text_frame
48
-
49
- for insight in data.insights:
50
- p = text_frame.add_paragraph()
51
- p.text = insight
52
- p.level = 0
53
- if "Pain Point" in insight:
54
- p.font.color.rgb = RGBColor(255, 0, 0)
55
-
56
- prs.save(filename)
57
- return filename
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/requirements.txt DELETED
@@ -1,29 +0,0 @@
1
- fastapi
2
- uvicorn[standard]
3
- python-multipart
4
- pandas
5
- numpy
6
- pydantic[email]
7
- pydantic-settings
8
- sqlalchemy
9
- alembic
10
- psycopg2-binary
11
- cryptography
12
- python-jose[cryptography]
13
- passlib[bcrypt]
14
- openpyxl
15
- pdfminer.six==20231228
16
- pdfplumber==0.10.3
17
- reportlab
18
- python-pptx
19
- pypdf
20
- stripe
21
- email-validator
22
- argon2-cffi
23
- httpx
24
- # Dolphin PDF Extraction (hybrid parser)
25
- torch>=2.0.0
26
- transformers>=4.40.0
27
- huggingface-hub
28
- Pillow
29
- pdf2image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dolphin/__init__.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Standalone Dolphin module for the AI Worker.
3
+ No backend dependencies — everything needed is self-contained here.
4
+ """
5
+
6
+ import os
7
+ import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ DEFAULT_MODEL_ID = "ByteDance/Dolphin"
12
+
13
+
14
+ def _detect_device() -> str:
15
+ """Auto-detect best available compute device."""
16
+ try:
17
+ import torch
18
+ if torch.cuda.is_available():
19
+ return "cuda"
20
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
21
+ return "mps"
22
+ except ImportError:
23
+ pass
24
+ return "cpu"
25
+
26
+
27
+ def get_device() -> str:
28
+ """Get device from env or auto-detect."""
29
+ device = os.getenv("DOLPHIN_DEVICE", "auto")
30
+ if device != "auto":
31
+ return device
32
+ return _detect_device()
33
+
34
+
35
+ def get_model_path() -> str:
36
+ """Get model path from env or default."""
37
+ return os.getenv("DOLPHIN_MODEL_PATH", DEFAULT_MODEL_ID)
{backend/app/services/ingestion/dolphin → dolphin}/client.py RENAMED
@@ -1,8 +1,6 @@
1
  """
2
- Dolphin Client — Wraps the ByteDance Dolphin-v2 model for document parsing.
3
-
4
- Provides page-level, element-level, and layout parsing capabilities
5
- with automatic device selection (CUDA > MPS > CPU).
6
  """
7
 
8
  import os
@@ -20,9 +18,9 @@ logger = logging.getLogger(__name__)
20
  @dataclass
21
  class DolphinElement:
22
  """A single parsed element from a document page."""
23
- element_type: str # "text", "table", "formula", "figure", "code"
24
- content: str # Markdown or plain text content
25
- bbox: Optional[List[float]] = None # [x1, y1, x2, y2] bounding box
26
  confidence: float = 1.0
27
  page_number: int = 0
28
  metadata: Dict[str, Any] = field(default_factory=dict)
@@ -32,7 +30,7 @@ class DolphinElement:
32
  class DolphinPageResult:
33
  """Result from page-level parsing."""
34
  page_number: int
35
- markdown: str # Full page rendered as Markdown
36
  structured_json: Dict[str, Any] = field(default_factory=dict)
37
  elements: List[DolphinElement] = field(default_factory=list)
38
 
@@ -41,9 +39,9 @@ class DolphinPageResult:
41
  class DolphinLayoutResult:
42
  """Result from layout analysis."""
43
  page_number: int
44
- sections: List[Dict[str, Any]] = field(default_factory=list) # [{type, bbox, label}]
45
- reading_order: List[int] = field(default_factory=list) # Element indices in reading order
46
- doc_type_hint: str = "unknown" # "digital" or "photographed"
47
 
48
 
49
  @dataclass
@@ -58,49 +56,24 @@ class DolphinDocumentResult:
58
  class DolphinClient:
59
  """
60
  High-level client for Dolphin-v2 document parsing.
61
-
62
- Acts as a factory: returns either a local model wrapper (if no API URL)
63
- or a remote client (if API URL is configured).
64
  """
65
 
66
- @staticmethod
67
- def create():
68
- """
69
- Factory method to create the appropriate Dolphin client.
70
-
71
- Returns:
72
- RemoteDolphinClient if DOLPHIN_API_URL is set
73
- LocalDolphinClient (self) otherwise
74
- """
75
- from app.core.config import settings
76
-
77
- if settings.DOLPHIN_API_URL:
78
- from app.services.ingestion.dolphin.remote_client import RemoteDolphinClient
79
- return RemoteDolphinClient()
80
-
81
- return DolphinClient()
82
-
83
  def __init__(
84
  self,
85
  model_path: Optional[str] = None,
86
  device: Optional[str] = None,
87
  max_batch_size: int = 4,
88
  ):
89
- from app.services.ingestion.dolphin import _get_model_path, get_device
90
 
91
- self.model_path = model_path or _get_model_path()
92
  self.device = device or get_device()
93
  self.max_batch_size = max_batch_size
94
  self._model = None
95
  self._processor = None
96
 
97
- logger.info(
98
- f"DolphinClient initialized: model={self.model_path}, device={self.device}"
99
- )
100
-
101
- # ------------------------------------------------------------------
102
- # Lazy model loading
103
- # ------------------------------------------------------------------
104
 
105
  def _ensure_loaded(self):
106
  """Lazy-load model and processor on first use."""
@@ -111,7 +84,7 @@ class DolphinClient:
111
  import torch
112
  from transformers import AutoModelForVision2Seq, AutoProcessor
113
 
114
- logger.info(f"Loading Dolphin-v2 model from {self.model_path}...")
115
 
116
  self._processor = AutoProcessor.from_pretrained(
117
  self.model_path, trust_remote_code=True
@@ -124,31 +97,25 @@ class DolphinClient:
124
  self._model.to(self.device)
125
  self._model.eval()
126
 
127
- logger.info("Dolphin-v2 model loaded successfully")
128
 
129
  except Exception as e:
130
  logger.error(f"Failed to load Dolphin model: {e}")
131
  raise RuntimeError(f"Dolphin model loading failed: {e}") from e
132
 
133
- # ------------------------------------------------------------------
134
- # PDF → Images conversion
135
- # ------------------------------------------------------------------
136
-
137
  @staticmethod
138
  def _pdf_to_images(pdf_path: str) -> list:
139
- """Convert PDF pages to PIL Images for Dolphin processing."""
140
  try:
141
  from pdf2image import convert_from_path
142
- images = convert_from_path(pdf_path, dpi=200)
143
- return images
144
  except ImportError:
145
- # Fallback: use pypdf + Pillow for basic conversion
146
- logger.warning("pdf2image not installed, using fallback renderer")
147
  return DolphinClient._pdf_to_images_fallback(pdf_path)
148
 
149
  @staticmethod
150
  def _pdf_to_images_fallback(pdf_path: str) -> list:
151
- """Fallback PDF image conversion using pypdf."""
152
  from PIL import Image
153
  import io
154
 
@@ -157,61 +124,30 @@ class DolphinClient:
157
  reader = PdfReader(pdf_path)
158
  images = []
159
  for page in reader.pages:
160
- # Extract any embedded images from the page
161
  for img_key in page.images:
162
- img_data = img_key.data
163
- img = Image.open(io.BytesIO(img_data))
164
  images.append(img)
165
- break # One image per page is enough
166
  if not images:
167
- # Create a blank placeholder if no images could be extracted
168
- logger.warning("No images extracted from PDF pages, layout analysis may be limited")
169
  for _ in reader.pages:
170
- img = Image.new("RGB", (1700, 2200), "white")
171
- images.append(img)
172
  return images
173
  except Exception as e:
174
  logger.error(f"Fallback PDF image conversion failed: {e}")
175
  return []
176
 
177
- # ------------------------------------------------------------------
178
- # Core parsing methods
179
- # ------------------------------------------------------------------
180
-
181
  def parse_page(self, image, page_number: int = 0) -> DolphinPageResult:
182
- """
183
- Parse a single page image into structured output.
184
-
185
- Args:
186
- image: PIL Image of the page
187
- page_number: Page index (0-based)
188
-
189
- Returns:
190
- DolphinPageResult with markdown and structured elements
191
- """
192
  self._ensure_loaded()
193
-
194
  try:
195
  import torch
196
-
197
- # Prepare input with page-level prompt
198
  prompt = "<page_parsing>"
199
- inputs = self._processor(
200
- images=image, text=prompt, return_tensors="pt"
201
- ).to(self.device)
202
 
203
  with torch.no_grad():
204
- outputs = self._model.generate(
205
- **inputs,
206
- max_new_tokens=4096,
207
- do_sample=False,
208
- )
209
-
210
- result_text = self._processor.batch_decode(
211
- outputs, skip_special_tokens=True
212
- )[0]
213
 
214
- # Parse elements from the result
215
  elements = self._parse_elements_from_text(result_text, page_number)
216
 
217
  return DolphinPageResult(
@@ -220,86 +156,47 @@ class DolphinClient:
220
  structured_json={"raw_output": result_text},
221
  elements=elements,
222
  )
223
-
224
  except Exception as e:
225
  logger.error(f"Dolphin page parsing failed for page {page_number}: {e}")
226
- return DolphinPageResult(
227
- page_number=page_number,
228
- markdown="",
229
- elements=[],
230
- )
231
 
232
  def parse_layout(self, image, page_number: int = 0) -> DolphinLayoutResult:
233
- """
234
- Analyze layout/structure of a page image.
235
-
236
- Returns section bounding boxes, reading order, and document type hint.
237
- """
238
  self._ensure_loaded()
239
-
240
  try:
241
  import torch
242
-
243
  prompt = "<layout_parsing>"
244
- inputs = self._processor(
245
- images=image, text=prompt, return_tensors="pt"
246
- ).to(self.device)
247
 
248
  with torch.no_grad():
249
- outputs = self._model.generate(
250
- **inputs,
251
- max_new_tokens=2048,
252
- do_sample=False,
253
- )
254
-
255
- result_text = self._processor.batch_decode(
256
- outputs, skip_special_tokens=True
257
- )[0]
258
 
 
259
  sections = self._parse_layout_sections(result_text)
260
- doc_type_hint = "digital" # Dolphin detects this in stage 1
261
 
262
  return DolphinLayoutResult(
263
  page_number=page_number,
264
  sections=sections,
265
  reading_order=list(range(len(sections))),
266
- doc_type_hint=doc_type_hint,
267
  )
268
-
269
  except Exception as e:
270
  logger.error(f"Dolphin layout parsing failed for page {page_number}: {e}")
271
  return DolphinLayoutResult(page_number=page_number)
272
 
273
  def parse_document(self, pdf_path: str) -> DolphinDocumentResult:
274
- """
275
- Parse an entire PDF document — page-level + layout for all pages.
276
-
277
- This is the main entry point for the hybrid parser.
278
-
279
- Args:
280
- pdf_path: Path to the PDF file
281
-
282
- Returns:
283
- DolphinDocumentResult with all pages parsed
284
- """
285
  images = self._pdf_to_images(pdf_path)
286
  if not images:
287
- logger.warning(f"No page images extracted from {pdf_path}")
288
  return DolphinDocumentResult(total_pages=0)
289
 
290
- pages = []
291
- layouts = []
292
- all_markdown = []
293
 
294
  for i, image in enumerate(images):
295
  logger.debug(f"Parsing page {i + 1}/{len(images)}")
296
-
297
- # Page-level parsing (structured content)
298
  page_result = self.parse_page(image, page_number=i)
299
  pages.append(page_result)
300
  all_markdown.append(page_result.markdown)
301
-
302
- # Layout analysis (structure detection)
303
  layout_result = self.parse_layout(image, page_number=i)
304
  layouts.append(layout_result)
305
 
@@ -310,50 +207,27 @@ class DolphinClient:
310
  total_pages=len(images),
311
  )
312
 
313
- # ------------------------------------------------------------------
314
- # Internal helpers
315
- # ------------------------------------------------------------------
316
-
317
  @staticmethod
318
  def _parse_elements_from_text(text: str, page_number: int) -> List[DolphinElement]:
319
- """Parse Dolphin's text output into structured DolphinElement objects."""
320
  elements = []
321
  if not text:
322
  return elements
323
 
324
  import re
325
-
326
- # Split by Markdown table blocks
327
  table_pattern = re.compile(r"(\|.+\|(?:\n\|.+\|)*)", re.MULTILINE)
328
-
329
  last_end = 0
330
  for match in table_pattern.finditer(text):
331
- # Text before table
332
  pre_text = text[last_end:match.start()].strip()
333
  if pre_text:
334
- elements.append(DolphinElement(
335
- element_type="text",
336
- content=pre_text,
337
- page_number=page_number,
338
- ))
339
-
340
- # Table element
341
- elements.append(DolphinElement(
342
- element_type="table",
343
- content=match.group(0),
344
- page_number=page_number,
345
- ))
346
  last_end = match.end()
347
 
348
- # Remaining text after last table
349
  remaining = text[last_end:].strip()
350
  if remaining:
351
- elements.append(DolphinElement(
352
- element_type="text",
353
- content=remaining,
354
- page_number=page_number,
355
- ))
356
-
357
  return elements
358
 
359
  @staticmethod
@@ -364,29 +238,17 @@ class DolphinClient:
364
  return sections
365
 
366
  import re
367
-
368
- # Dolphin layout output typically contains bounding box coordinates
369
- # Pattern: <section_type> [x1, y1, x2, y2]
370
- bbox_pattern = re.compile(
371
- r"(\w+[\w\s]*?)\s*\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]"
372
- )
373
 
374
  for match in bbox_pattern.finditer(text):
375
  sections.append({
376
  "type": match.group(1).strip(),
377
- "bbox": [
378
- int(match.group(2)),
379
- int(match.group(3)),
380
- int(match.group(4)),
381
- int(match.group(5)),
382
- ],
383
  })
384
 
385
- # If no bbox patterns found, treat each line as a section label
386
  if not sections:
387
  for line in text.strip().split("\n"):
388
  line = line.strip()
389
  if line:
390
  sections.append({"type": line, "bbox": []})
391
-
392
  return sections
 
1
  """
2
+ Dolphin Client — Standalone version for the AI Worker.
3
+ No backend dependencies.
 
 
4
  """
5
 
6
  import os
 
18
  @dataclass
19
  class DolphinElement:
20
  """A single parsed element from a document page."""
21
+ element_type: str
22
+ content: str
23
+ bbox: Optional[List[float]] = None
24
  confidence: float = 1.0
25
  page_number: int = 0
26
  metadata: Dict[str, Any] = field(default_factory=dict)
 
30
  class DolphinPageResult:
31
  """Result from page-level parsing."""
32
  page_number: int
33
+ markdown: str
34
  structured_json: Dict[str, Any] = field(default_factory=dict)
35
  elements: List[DolphinElement] = field(default_factory=list)
36
 
 
39
  class DolphinLayoutResult:
40
  """Result from layout analysis."""
41
  page_number: int
42
+ sections: List[Dict[str, Any]] = field(default_factory=list)
43
+ reading_order: List[int] = field(default_factory=list)
44
+ doc_type_hint: str = "unknown"
45
 
46
 
47
  @dataclass
 
56
  class DolphinClient:
57
  """
58
  High-level client for Dolphin-v2 document parsing.
59
+ Standalone version — no backend package dependencies.
 
 
60
  """
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  def __init__(
63
  self,
64
  model_path: Optional[str] = None,
65
  device: Optional[str] = None,
66
  max_batch_size: int = 4,
67
  ):
68
+ from dolphin import get_model_path, get_device
69
 
70
+ self.model_path = model_path or get_model_path()
71
  self.device = device or get_device()
72
  self.max_batch_size = max_batch_size
73
  self._model = None
74
  self._processor = None
75
 
76
+ logger.info(f"DolphinClient initialized: model={self.model_path}, device={self.device}")
 
 
 
 
 
 
77
 
78
  def _ensure_loaded(self):
79
  """Lazy-load model and processor on first use."""
 
84
  import torch
85
  from transformers import AutoModelForVision2Seq, AutoProcessor
86
 
87
+ logger.info(f"Loading Dolphin model from {self.model_path}...")
88
 
89
  self._processor = AutoProcessor.from_pretrained(
90
  self.model_path, trust_remote_code=True
 
97
  self._model.to(self.device)
98
  self._model.eval()
99
 
100
+ logger.info("Dolphin model loaded successfully")
101
 
102
  except Exception as e:
103
  logger.error(f"Failed to load Dolphin model: {e}")
104
  raise RuntimeError(f"Dolphin model loading failed: {e}") from e
105
 
 
 
 
 
106
  @staticmethod
107
  def _pdf_to_images(pdf_path: str) -> list:
108
+ """Convert PDF pages to PIL Images."""
109
  try:
110
  from pdf2image import convert_from_path
111
+ return convert_from_path(pdf_path, dpi=200)
 
112
  except ImportError:
113
+ logger.warning("pdf2image not installed, using fallback")
 
114
  return DolphinClient._pdf_to_images_fallback(pdf_path)
115
 
116
  @staticmethod
117
  def _pdf_to_images_fallback(pdf_path: str) -> list:
118
+ """Fallback PDF to image conversion."""
119
  from PIL import Image
120
  import io
121
 
 
124
  reader = PdfReader(pdf_path)
125
  images = []
126
  for page in reader.pages:
 
127
  for img_key in page.images:
128
+ img = Image.open(io.BytesIO(img_key.data))
 
129
  images.append(img)
130
+ break
131
  if not images:
 
 
132
  for _ in reader.pages:
133
+ images.append(Image.new("RGB", (1700, 2200), "white"))
 
134
  return images
135
  except Exception as e:
136
  logger.error(f"Fallback PDF image conversion failed: {e}")
137
  return []
138
 
 
 
 
 
139
  def parse_page(self, image, page_number: int = 0) -> DolphinPageResult:
140
+ """Parse a single page image into structured output."""
 
 
 
 
 
 
 
 
 
141
  self._ensure_loaded()
 
142
  try:
143
  import torch
 
 
144
  prompt = "<page_parsing>"
145
+ inputs = self._processor(images=image, text=prompt, return_tensors="pt").to(self.device)
 
 
146
 
147
  with torch.no_grad():
148
+ outputs = self._model.generate(**inputs, max_new_tokens=4096, do_sample=False)
 
 
 
 
 
 
 
 
149
 
150
+ result_text = self._processor.batch_decode(outputs, skip_special_tokens=True)[0]
151
  elements = self._parse_elements_from_text(result_text, page_number)
152
 
153
  return DolphinPageResult(
 
156
  structured_json={"raw_output": result_text},
157
  elements=elements,
158
  )
 
159
  except Exception as e:
160
  logger.error(f"Dolphin page parsing failed for page {page_number}: {e}")
161
+ return DolphinPageResult(page_number=page_number, markdown="", elements=[])
 
 
 
 
162
 
163
  def parse_layout(self, image, page_number: int = 0) -> DolphinLayoutResult:
164
+ """Analyze layout/structure of a page image."""
 
 
 
 
165
  self._ensure_loaded()
 
166
  try:
167
  import torch
 
168
  prompt = "<layout_parsing>"
169
+ inputs = self._processor(images=image, text=prompt, return_tensors="pt").to(self.device)
 
 
170
 
171
  with torch.no_grad():
172
+ outputs = self._model.generate(**inputs, max_new_tokens=2048, do_sample=False)
 
 
 
 
 
 
 
 
173
 
174
+ result_text = self._processor.batch_decode(outputs, skip_special_tokens=True)[0]
175
  sections = self._parse_layout_sections(result_text)
 
176
 
177
  return DolphinLayoutResult(
178
  page_number=page_number,
179
  sections=sections,
180
  reading_order=list(range(len(sections))),
181
+ doc_type_hint="digital",
182
  )
 
183
  except Exception as e:
184
  logger.error(f"Dolphin layout parsing failed for page {page_number}: {e}")
185
  return DolphinLayoutResult(page_number=page_number)
186
 
187
  def parse_document(self, pdf_path: str) -> DolphinDocumentResult:
188
+ """Parse an entire PDF document."""
 
 
 
 
 
 
 
 
 
 
189
  images = self._pdf_to_images(pdf_path)
190
  if not images:
 
191
  return DolphinDocumentResult(total_pages=0)
192
 
193
+ pages, layouts, all_markdown = [], [], []
 
 
194
 
195
  for i, image in enumerate(images):
196
  logger.debug(f"Parsing page {i + 1}/{len(images)}")
 
 
197
  page_result = self.parse_page(image, page_number=i)
198
  pages.append(page_result)
199
  all_markdown.append(page_result.markdown)
 
 
200
  layout_result = self.parse_layout(image, page_number=i)
201
  layouts.append(layout_result)
202
 
 
207
  total_pages=len(images),
208
  )
209
 
 
 
 
 
210
  @staticmethod
211
  def _parse_elements_from_text(text: str, page_number: int) -> List[DolphinElement]:
212
+ """Parse Dolphin's text output into structured elements."""
213
  elements = []
214
  if not text:
215
  return elements
216
 
217
  import re
 
 
218
  table_pattern = re.compile(r"(\|.+\|(?:\n\|.+\|)*)", re.MULTILINE)
219
+
220
  last_end = 0
221
  for match in table_pattern.finditer(text):
 
222
  pre_text = text[last_end:match.start()].strip()
223
  if pre_text:
224
+ elements.append(DolphinElement(element_type="text", content=pre_text, page_number=page_number))
225
+ elements.append(DolphinElement(element_type="table", content=match.group(0), page_number=page_number))
 
 
 
 
 
 
 
 
 
 
226
  last_end = match.end()
227
 
 
228
  remaining = text[last_end:].strip()
229
  if remaining:
230
+ elements.append(DolphinElement(element_type="text", content=remaining, page_number=page_number))
 
 
 
 
 
231
  return elements
232
 
233
  @staticmethod
 
238
  return sections
239
 
240
  import re
241
+ bbox_pattern = re.compile(r"(\w+[\w\s]*?)\s*\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]")
 
 
 
 
 
242
 
243
  for match in bbox_pattern.finditer(text):
244
  sections.append({
245
  "type": match.group(1).strip(),
246
+ "bbox": [int(match.group(2)), int(match.group(3)), int(match.group(4)), int(match.group(5))],
 
 
 
 
 
247
  })
248
 
 
249
  if not sections:
250
  for line in text.strip().split("\n"):
251
  line = line.strip()
252
  if line:
253
  sections.append({"type": line, "bbox": []})
 
254
  return sections