yoursdvniel committed on
Commit
f1ac8ee
·
verified ·
1 Parent(s): 9fed675

Revised sme-dump endpoint.

Browse files
Files changed (1) hide show
  1. main.py +491 -0
main.py CHANGED
@@ -520,6 +520,345 @@ def _normalize_outline_json(ai_result: Dict[str, Any]) -> Dict[str, Any]:
520
  ],
521
  }
522
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
  # -- route ---------------------------------------------------------------
524
 
525
  @app.route('/chat', methods=['POST'])
@@ -967,6 +1306,158 @@ def generate_course_outline():
967
  "error": "Failed to generate course outline from file"
968
  }), 500
969
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
970
 
971
# Script entry point: start the Flask development server when this module is
# executed directly (not when it is imported).
if __name__ == "__main__":
    # Listen on every interface, port 7860.
    app.run(port=7860, host="0.0.0.0")
 
520
  ],
521
  }
522
 
523
+ # --- SME Intake helpers -------------------------------------------------
524
+
525
+ ALLOWED_SME_DOC_EXTENSIONS = {"pdf", "docx", "txt", "png", "jpg", "jpeg"}
526
+ MAX_SME_DOC_TEXT_CHARS = 45000
527
+
528
+
529
+ def _allowed_sme_doc(filename: str) -> bool:
530
+ if not filename or "." not in filename:
531
+ return False
532
+ return filename.rsplit(".", 1)[1].lower() in ALLOWED_SME_DOC_EXTENSIONS
533
+
534
+
535
def _extract_text_from_txt_bytes(file_bytes: bytes) -> str:
    """Decode a plain-text upload and pass it through ``_clean_extracted_text``.

    Args:
        file_bytes: Raw bytes of the uploaded ``.txt`` file.

    Returns:
        The cleaned text, or ``""`` if decoding or cleaning fails for any
        reason (best-effort: a bad text file must not abort the whole intake).
    """
    try:
        # "utf-8-sig" decodes exactly like "utf-8" but also drops a leading
        # byte-order mark, which some editors prepend to text files and
        # which would otherwise survive as a stray "\ufeff" character.
        # Undecodable bytes are skipped rather than raising.
        decoded = file_bytes.decode("utf-8-sig", errors="ignore")
        return _clean_extracted_text(decoded)
    except Exception:
        return ""
540
+
541
+
542
+ def _extract_sme_doc_text(filename: str, file_bytes: bytes) -> str:
543
+ ext = filename.rsplit(".", 1)[1].lower()
544
+
545
+ if ext == "pdf":
546
+ return _extract_text_from_pdf_bytes(file_bytes)
547
+
548
+ if ext == "docx":
549
+ return _extract_text_from_docx_bytes(file_bytes)
550
+
551
+ if ext == "txt":
552
+ return _extract_text_from_txt_bytes(file_bytes)
553
+
554
+ # For images, we are not doing OCR here.
555
+ # Send metadata to AI, but text extraction stays blank.
556
+ if ext in ["png", "jpg", "jpeg"]:
557
+ return ""
558
+
559
+ return ""
560
+
561
+
562
def _fetch_active_programs(company_code: str) -> List[Dict[str, Any]]:
    """Load up to 30 programs of a company for use as AI recommendation candidates.

    Reads the ``programs`` collection filtered on ``companyCode``. A program is
    kept when it has no status at all, or when its status (``status`` or
    ``programStatus``, passed through ``_norm``) is one of ``active``, ``open``
    or ``running``.

    Args:
        company_code: Value matched against each document's ``companyCode``.

    Returns:
        A list of at most 30 summary dicts (``id``, ``name``, ``description``,
        ``sector``, ``stage``, ``branch``, ``requirements``). Returns ``[]`` if
        the query fails; the failure is printed, not raised.
    """
    max_programs = 30
    accepted_statuses = ("active", "open", "running")

    try:
        query = db.collection("programs").where("companyCode", "==", company_code)

        programs: List[Dict[str, Any]] = []
        for doc_snap in query.stream():
            data = doc_snap.to_dict() or {}
            status = _norm(data.get("status") or data.get("programStatus"))
            # A missing status is treated as "active"; only an explicit
            # non-active status excludes the program.
            if status and status not in accepted_statuses:
                continue

            programs.append({
                "id": doc_snap.id,
                "name": data.get("name") or data.get("programName") or data.get("title") or "",
                "description": data.get("description") or "",
                "sector": data.get("sector") or data.get("targetSector") or "",
                "stage": data.get("stage") or data.get("targetStage") or "",
                "branch": data.get("assignedBranch") or data.get("branch") or "",
                "requirements": data.get("requirements") or [],
            })

            # Stop reading as soon as the cap is reached instead of streaming
            # the whole collection and slicing afterwards.
            if len(programs) >= max_programs:
                break

        return programs
    except Exception as e:
        print("fetch_active_programs_failed:", e)
        return []
591
+
592
+
593
def _fetch_intervention_catalog(company_code: str) -> List[Dict[str, Any]]:
    """Load up to 80 interventions of a company for use as AI recommendation candidates.

    Reads the ``interventions`` collection filtered on ``companyCode``. An
    intervention is kept when it has no status, or when its status (passed
    through ``_norm``) is one of ``active``, ``enabled`` or ``approved``.

    Args:
        company_code: Value matched against each document's ``companyCode``.

    Returns:
        A list of at most 80 summary dicts (``id``, ``title``, ``department``,
        ``description``, ``tags``, ``stage``). Returns ``[]`` if the query
        fails; the failure is printed, not raised.
    """
    max_interventions = 80
    accepted_statuses = ("active", "enabled", "approved")

    try:
        query = db.collection("interventions").where("companyCode", "==", company_code)

        interventions: List[Dict[str, Any]] = []
        for doc_snap in query.stream():
            data = doc_snap.to_dict() or {}
            status = _norm(data.get("status"))
            # A missing status counts as usable; only an explicit
            # non-accepted status excludes the intervention.
            if status and status not in accepted_statuses:
                continue

            interventions.append({
                "id": doc_snap.id,
                "title": data.get("title") or data.get("interventionTitle") or data.get("name") or "",
                "department": data.get("departmentName") or data.get("areaOfSupport") or data.get("department") or "",
                "description": data.get("description") or data.get("objective") or "",
                "tags": data.get("tags") or [],
                "stage": data.get("stage") or data.get("businessStage") or "",
            })

            # Stop reading as soon as the cap is reached instead of streaming
            # the whole collection and slicing afterwards.
            if len(interventions) >= max_interventions:
                break

        return interventions
    except Exception as e:
        print("fetch_intervention_catalog_failed:", e)
        return []
621
+
622
+
623
+ def _build_sme_intake_prompt(payload: Dict[str, Any]) -> str:
624
+ return f"""
625
+ You are analysing an unregistered SME intake for an incubation/support platform.
626
+
627
+ Return STRICT JSON only with this exact shape:
628
+ {{
629
+ "detectedLanguages": ["English"],
630
+ "confidence": 0,
631
+ "extractedProfile": {{
632
+ "businessName": "",
633
+ "beneficiaryName": "",
634
+ "contactPerson": "",
635
+ "email": "",
636
+ "phone": "",
637
+ "registrationNumber": "",
638
+ "taxNumber": "",
639
+ "sector": "",
640
+ "province": "",
641
+ "city": "",
642
+ "yearsOperating": "",
643
+ "employeeCount": "",
644
+ "monthlyRevenue": "",
645
+ "mainChallenges": []
646
+ }},
647
+ "extractedApplication": {{
648
+ "motivation": "",
649
+ "challenges": "",
650
+ "businessDescription": "",
651
+ "supportNeeded": [],
652
+ "documentsMentioned": []
653
+ }},
654
+ "documentFindings": [
655
+ {{
656
+ "documentType": "id_document|cipc|tax_pin|other",
657
+ "filename": "",
658
+ "extractedFields": {{}},
659
+ "confidence": 0,
660
+ "warnings": []
661
+ }}
662
+ ],
663
+ "missingFields": [
664
+ {{
665
+ "field": "",
666
+ "label": "",
667
+ "question": "",
668
+ "reason": ""
669
+ }}
670
+ ],
671
+ "missingDocuments": [
672
+ {{
673
+ "type": "id_document|cipc|tax_pin|other",
674
+ "reason": ""
675
+ }}
676
+ ],
677
+ "nextQuestion": "",
678
+ "decision": {{
679
+ "stage": "idea|startup|early_growth|struggling|scaling|market_ready",
680
+ "urgencyLevel": "low|medium|high|urgent",
681
+ "urgencyScore": 0,
682
+ "riskLevel": "low|medium|high",
683
+ "recommendedProgramId": "",
684
+ "recommendedProgramName": "",
685
+ "recommendedInterventions": [
686
+ {{
687
+ "title": "",
688
+ "department": "",
689
+ "urgency": "low|medium|high|urgent",
690
+ "reason": ""
691
+ }}
692
+ ],
693
+ "summary": "",
694
+ "classificationReasons": [],
695
+ "redFlags": [],
696
+ "growthSignals": [],
697
+ "benefitsOfProgram": [],
698
+ "benefitsOfAgents": []
699
+ }},
700
+ "warnings": []
701
+ }}
702
+
703
+ Rules:
704
+ - Use the SME story and uploaded document text only.
705
+ - Do not invent registration numbers, tax numbers, IDs, turnover, employees, or compliance status.
706
+ - If a field is missing, add it to missingFields using a friendly conversational question.
707
+ - Required basic documents are: Director ID, CIPC document, SARS Tax PIN.
708
+ - If a required document is not uploaded or cannot be identified, add it to missingDocuments.
709
+ - Classify the SME stage:
710
+ - idea: concept only, no operating history
711
+ - startup: newly operating or still proving model
712
+ - early_growth: some traction, some customers/revenue
713
+ - struggling: stagnant revenue, compliance gaps, cashflow pressure, no growth, operational distress
714
+ - scaling: growing and needing structured support
715
+ - market_ready: compliant and ready for procurement/market linkage
716
+ - Urgency must reflect intervention need, not emotional tone.
717
+ - urgent/high urgency should be used where compliance, tax, funding, payroll, legal, safety, or survival risks are clear.
718
+ - Recommend interventions only from the intervention catalog where possible.
719
+ - Recommend one program from availablePrograms where possible.
720
+ - If no program fits, leave recommendedProgramId blank and explain in summary.
721
+ - benefitsOfProgram must explain why structured incubation helps.
722
+ - benefitsOfAgents must explain why direct agent support helps without programme onboarding.
723
+ - Keep text professional and concise.
724
+ - Return JSON only.
725
+
726
+ Payload:
727
+ {json.dumps(payload, ensure_ascii=False)}
728
+ """.strip()
729
+
730
+
731
def _sme_clean_str_list(value: Any, max_items=None) -> List[str]:
    """Return the stripped, non-blank string form of each entry of *value*.

    A bare string is treated as a one-item list (so it is not split into
    characters); ``None`` entries are dropped; any other non-list value
    yields ``[]``. ``max_items=None`` means no cap.
    """
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, (list, tuple)):
        return []

    cleaned = []
    for entry in value:
        if entry is None:
            continue
        text = str(entry).strip()
        if text:
            cleaned.append(text)
    return cleaned[:max_items]


def _sme_dict_items(value: Any) -> List[Dict[str, Any]]:
    """Return only the dict entries of a list-like *value*; anything else is dropped."""
    if not isinstance(value, (list, tuple)):
        return []
    return [entry for entry in value if isinstance(entry, dict)]


def _normalize_sme_intake_result(raw: Dict[str, Any]) -> Dict[str, Any]:
    """Coerce the AI's SME intake answer into the response shape the endpoint returns.

    The input is model output and is therefore not trusted to match the
    requested shape: wrong container types, ``null`` list fields and non-dict
    list items are tolerated and replaced by empty defaults instead of raising.

    Args:
        raw: Parsed JSON object returned by the model.

    Returns:
        A dict with the keys ``detectedLanguages``, ``confidence``,
        ``extractedProfile``, ``extractedApplication``, ``documentFindings``,
        ``missingFields`` (max 12), ``missingDocuments`` (max 10),
        ``nextQuestion``, ``decision`` and ``warnings``. Unknown ``stage``,
        ``urgencyLevel`` and ``riskLevel`` values fall back to ``"startup"``,
        ``"medium"`` and ``"medium"`` respectively.
    """
    if not isinstance(raw, dict):
        raw = {}

    decision = raw.get("decision")
    if not isinstance(decision, dict):
        decision = {}

    allowed_stages = ("idea", "startup", "early_growth", "struggling", "scaling", "market_ready")
    allowed_urgency = ("low", "medium", "high", "urgent")
    allowed_risk = ("low", "medium", "high")

    stage = _norm(decision.get("stage"))
    if stage not in allowed_stages:
        stage = "startup"

    urgency = _norm(decision.get("urgencyLevel"))
    if urgency not in allowed_urgency:
        urgency = "medium"

    risk = _norm(decision.get("riskLevel"))
    if risk not in allowed_risk:
        risk = "medium"

    interventions = []
    for item in _sme_dict_items(decision.get("recommendedInterventions")):
        title = str(item.get("title") or "").strip()
        if not title:
            continue

        # An intervention without a valid urgency inherits the overall one.
        item_urgency = _norm(item.get("urgency"))
        if item_urgency not in allowed_urgency:
            item_urgency = urgency

        interventions.append({
            "title": title,
            "department": str(item.get("department") or "").strip(),
            "urgency": item_urgency,
            "reason": str(item.get("reason") or "").strip(),
        })

    missing_fields = []
    for item in _sme_dict_items(raw.get("missingFields")):
        field = str(item.get("field") or "").strip()
        question = str(item.get("question") or "").strip()
        # Both the field key and the question to ask are needed to be usable.
        if not field or not question:
            continue

        missing_fields.append({
            "field": field,
            "label": str(item.get("label") or field).strip(),
            "question": question,
            "reason": str(item.get("reason") or "").strip(),
        })

    missing_documents = []
    for item in _sme_dict_items(raw.get("missingDocuments")):
        doc_type = str(item.get("type") or "").strip()
        if not doc_type:
            continue

        missing_documents.append({
            "type": doc_type,
            "reason": str(item.get("reason") or "").strip(),
        })

    document_findings = []
    for item in _sme_dict_items(raw.get("documentFindings")):
        extracted_fields = item.get("extractedFields")
        document_findings.append({
            "documentType": str(item.get("documentType") or "other").strip(),
            "filename": str(item.get("filename") or "").strip(),
            "extractedFields": extracted_fields if isinstance(extracted_fields, dict) else {},
            "confidence": _clamp_pct(item.get("confidence")),
            "warnings": _sme_clean_str_list(item.get("warnings")),
        })

    extracted_profile = raw.get("extractedProfile")
    extracted_application = raw.get("extractedApplication")

    return {
        "detectedLanguages": _sme_clean_str_list(raw.get("detectedLanguages")),
        "confidence": _clamp_pct(raw.get("confidence")),
        "extractedProfile": extracted_profile if isinstance(extracted_profile, dict) else {},
        "extractedApplication": extracted_application if isinstance(extracted_application, dict) else {},
        "documentFindings": document_findings,
        "missingFields": missing_fields[:12],
        "missingDocuments": missing_documents[:10],
        "nextQuestion": str(raw.get("nextQuestion") or "").strip(),
        "decision": {
            "stage": stage,
            "urgencyLevel": urgency,
            "urgencyScore": _clamp_pct(decision.get("urgencyScore")),
            "riskLevel": risk,
            "recommendedProgramId": str(decision.get("recommendedProgramId") or "").strip(),
            "recommendedProgramName": str(decision.get("recommendedProgramName") or "").strip(),
            "recommendedInterventions": interventions[:10],
            "summary": str(decision.get("summary") or "").strip(),
            "classificationReasons": _sme_clean_str_list(decision.get("classificationReasons"), 10),
            "redFlags": _sme_clean_str_list(decision.get("redFlags"), 10),
            "growthSignals": _sme_clean_str_list(decision.get("growthSignals"), 10),
            "benefitsOfProgram": _sme_clean_str_list(decision.get("benefitsOfProgram"), 6),
            "benefitsOfAgents": _sme_clean_str_list(decision.get("benefitsOfAgents"), 6),
        },
        "warnings": _sme_clean_str_list(raw.get("warnings")),
    }
861
+
862
  # -- route ---------------------------------------------------------------
863
 
864
  @app.route('/chat', methods=['POST'])
 
1306
  "error": "Failed to generate course outline from file"
1307
  }), 500
1308
 
1309
def _build_sme_document_payload(uploaded):
    """Turn one uploaded file into the document entry that is sent to the AI.

    Returns ``None`` when the upload has no filename (an empty file input),
    otherwise a dict with ``filename``, ``contentType``, ``extractedText`` and
    ``warnings``. Unsupported or empty files are reported through ``warnings``
    with blank text rather than being dropped.
    """
    filename = uploaded.filename or ""
    if not filename:
        return None

    entry = {
        "filename": filename,
        "contentType": uploaded.content_type,
        "extractedText": "",
        "warnings": [],
    }

    if not _allowed_sme_doc(filename):
        entry["warnings"].append("Unsupported document type.")
        return entry

    file_bytes = uploaded.read()
    if not file_bytes:
        entry["warnings"].append("Uploaded file was empty.")
        return entry

    extracted_text = _extract_sme_doc_text(filename, file_bytes)
    truncated_text = _truncate_source_text(extracted_text, MAX_SME_DOC_TEXT_CHARS)

    # Safe to index: _allowed_sme_doc() above guarantees the name contains a dot.
    is_image = filename.rsplit(".", 1)[1].lower() in ("png", "jpg", "jpeg")

    if is_image:
        entry["warnings"].append("Image OCR is not enabled on this endpoint yet.")
    if extracted_text and len(truncated_text) < len(extracted_text):
        entry["warnings"].append("Document text was truncated before AI analysis.")
    if not extracted_text and not is_image:
        entry["warnings"].append("No readable text could be extracted from this document.")

    entry["extractedText"] = truncated_text
    return entry


@app.route('/analyze-sme-application-intake', methods=['POST'])
def analyze_sme_application_intake():
    """
    Multipart form-data endpoint for unregistered SME intake.

    Expected form fields:
    - mode: initial_review | final_decision (defaults to initial_review)
    - companyCode (required)
    - userId optional
    - contactName
    - contactEmail (the field name "email" is also accepted and takes precedence)
    - contactPhone
    - rawStory (required)
    - missingAnswersJson optional (malformed JSON is treated as {})
    - requiredDocumentsJson optional (malformed JSON is treated as [])
    - files / files[] optional (both field names are read)

    Response:
    {
    detectedLanguages,
    confidence,
    extractedProfile,
    extractedApplication,
    documentFindings,
    missingFields,
    missingDocuments,
    nextQuestion,
    decision,
    warnings
    }

    Errors: 400 when companyCode or rawStory is missing; 500 on any other failure.
    """
    try:
        mode = request.form.get("mode") or "initial_review"
        company_code = request.form.get("companyCode")
        user_id = request.form.get("userId") or ""
        contact_name = request.form.get("contactName") or ""
        contact_email = request.form.get("email") or request.form.get("contactEmail") or ""
        contact_phone = request.form.get("contactPhone") or ""
        raw_story = (request.form.get("rawStory") or "").strip()

        if not company_code:
            return jsonify({"error": "Missing companyCode"}), 400

        if not raw_story:
            return jsonify({"error": "Missing rawStory"}), 400

        # Optional JSON fields are best-effort: unparsable input falls back to
        # an empty value instead of failing the request.
        try:
            missing_answers = json.loads(request.form.get("missingAnswersJson") or "{}")
        except (TypeError, ValueError, RecursionError):
            missing_answers = {}

        try:
            required_documents = json.loads(request.form.get("requiredDocumentsJson") or "[]")
        except (TypeError, ValueError, RecursionError):
            required_documents = []

        # The documented field name is "files[]" while earlier code read only
        # "files"; accept both so either client convention works.
        uploaded_files = request.files.getlist("files") + request.files.getlist("files[]")

        document_payloads = []
        for uploaded in uploaded_files:
            document_entry = _build_sme_document_payload(uploaded)
            if document_entry is not None:
                document_payloads.append(document_entry)

        available_programs = _fetch_active_programs(company_code)
        intervention_catalog = _fetch_intervention_catalog(company_code)

        payload = {
            "mode": mode,
            "companyCode": company_code,
            "userId": user_id,
            "contact": {
                "name": contact_name,
                "email": contact_email,
                "phone": contact_phone,
            },
            "rawStory": raw_story,
            "missingAnswers": missing_answers,
            "requiredDocuments": required_documents,
            "uploadedDocuments": document_payloads,
            "availablePrograms": available_programs,
            "interventionCatalog": intervention_catalog,
            "analysisDate": datetime.utcnow().isoformat(),
        }

        system_msg = {
            "role": "system",
            "content": (
                "You analyse SME intake information for an incubation platform. "
                "You return strict JSON only. "
                "You must not invent official facts, registration numbers, tax numbers, "
                "document statuses, revenue, employees, or compliance claims."
            )
        }

        user_msg = {
            "role": "user",
            "content": _build_sme_intake_prompt(payload)
        }

        ai_raw = ask_gpt([system_msg, user_msg])
        ai_result = _extract_json_block(ai_raw)
        normalized = _normalize_sme_intake_result(ai_result)

        return jsonify(to_jsonable(normalized))

    except Exception as e:
        # Top-level boundary: log and answer with a generic 500 so internal
        # details are not leaked to the client.
        print("analyze_sme_application_intake_failed:", e)
        return jsonify({
            "error": "Failed to analyse SME application intake"
        }), 500
1460
+
1461
 
1462
# Script entry point: start the Flask development server when this module is
# executed directly (not when it is imported).
if __name__ == "__main__":
    # Listen on every interface, port 7860.
    app.run(port=7860, host="0.0.0.0")