Spaces:

yoursdvniel
/

SmartInc-API

Sleeping

App Files Files Community

yoursdvniel committed on Apr 29

Commit

f1ac8ee

verified ·

1 Parent(s): 9fed675

Revised sme-dump endpoint.

Browse files

Files changed (1) hide show

main.py +491 -0

main.py CHANGED Viewed

@@ -520,6 +520,345 @@ def _normalize_outline_json(ai_result: Dict[str, Any]) -> Dict[str, Any]:
         ],
     }
 # -- route ---------------------------------------------------------------
 @app.route('/chat', methods=['POST'])
@@ -967,6 +1306,158 @@ def generate_course_outline():
             "error": "Failed to generate course outline from file"
         }), 500
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860)

         ],
     }
+# --- SME Intake helpers -------------------------------------------------
+ALLOWED_SME_DOC_EXTENSIONS = {"pdf", "docx", "txt", "png", "jpg", "jpeg"}
+MAX_SME_DOC_TEXT_CHARS = 45000
+def _allowed_sme_doc(filename: str) -> bool:
+    if not filename or "." not in filename:
+        return False
+    return filename.rsplit(".", 1)[1].lower() in ALLOWED_SME_DOC_EXTENSIONS
def _extract_text_from_txt_bytes(file_bytes: bytes) -> str:
    """Decode a plain-text upload and return its cleaned text.

    The bytes are decoded with the "utf-8-sig" codec so that a leading UTF-8
    byte-order mark is dropped instead of surviving as a U+FEFF character at
    the start of the text. Bytes that are not valid UTF-8 are skipped.

    Args:
        file_bytes: Raw content of the uploaded .txt file.

    Returns:
        The decoded text after _clean_extracted_text, or "" when decoding or
        cleaning fails for any reason.
    """
    try:
        decoded = file_bytes.decode("utf-8-sig", errors="ignore")
        return _clean_extracted_text(decoded)
    except Exception:
        # Deliberately best effort: an unreadable upload yields empty text so
        # the caller can attach a warning rather than fail the whole request.
        return ""
+def _extract_sme_doc_text(filename: str, file_bytes: bytes) -> str:
+    ext = filename.rsplit(".", 1)[1].lower()
+    if ext == "pdf":
+        return _extract_text_from_pdf_bytes(file_bytes)
+    if ext == "docx":
+        return _extract_text_from_docx_bytes(file_bytes)
+    if ext == "txt":
+        return _extract_text_from_txt_bytes(file_bytes)
+    # For images, we are not doing OCR here.
+    # Send metadata to AI, but text extraction stays blank.
+    if ext in ["png", "jpg", "jpeg"]:
+        return ""
+    return ""
def _fetch_active_programs(company_code: str, limit: int = 30) -> List[Dict[str, Any]]:
    """Return a trimmed summary of a company's active programs.

    Queries the "programs" collection for documents whose "companyCode"
    equals *company_code*. A document is kept when it has no status, or when
    its normalised status is "active", "open" or "running".

    Args:
        company_code: Value matched against the "companyCode" field.
        limit: Maximum number of programs to return (default 30). Reading of
            the result stream stops as soon as this many have been collected.

    Returns:
        A list of dicts with the keys id, name, description, sector, stage,
        branch and requirements. Returns [] when *limit* is not positive or
        when the query fails (the error is printed, not raised).
    """
    if limit <= 0:
        return []

    active_statuses = ("active", "open", "running")
    try:
        docs = (
            db.collection("programs")
            .where("companyCode", "==", company_code)
            .stream()
        )
        programs: List[Dict[str, Any]] = []
        for doc_snap in docs:
            data = doc_snap.to_dict() or {}
            status = _norm(data.get("status") or data.get("programStatus"))
            # A missing status is treated as active; only explicit
            # non-active statuses are filtered out.
            if status and status not in active_statuses:
                continue
            programs.append({
                "id": doc_snap.id,
                "name": data.get("name") or data.get("programName") or data.get("title") or "",
                "description": data.get("description") or "",
                "sector": data.get("sector") or data.get("targetSector") or "",
                "stage": data.get("stage") or data.get("targetStage") or "",
                "branch": data.get("assignedBranch") or data.get("branch") or "",
                "requirements": data.get("requirements") or [],
            })
            # Stop early instead of reading the rest of the collection only
            # to slice the result afterwards.
            if len(programs) >= limit:
                break
        return programs
    except Exception as e:
        # Best effort: the intake analysis proceeds without program data.
        print("fetch_active_programs_failed:", e)
        return []
def _fetch_intervention_catalog(company_code: str, limit: int = 80) -> List[Dict[str, Any]]:
    """Return a trimmed summary of a company's usable interventions.

    Queries the "interventions" collection for documents whose "companyCode"
    equals *company_code*. A document is kept when it has no status, or when
    its normalised status is "active", "enabled" or "approved".

    Args:
        company_code: Value matched against the "companyCode" field.
        limit: Maximum number of interventions to return (default 80).
            Reading of the result stream stops once this many are collected.

    Returns:
        A list of dicts with the keys id, title, department, description,
        tags and stage. Returns [] when *limit* is not positive or when the
        query fails (the error is printed, not raised).
    """
    if limit <= 0:
        return []

    usable_statuses = ("active", "enabled", "approved")
    try:
        docs = (
            db.collection("interventions")
            .where("companyCode", "==", company_code)
            .stream()
        )
        interventions: List[Dict[str, Any]] = []
        for doc_snap in docs:
            data = doc_snap.to_dict() or {}
            status = _norm(data.get("status"))
            # A missing status is treated as usable; only explicit
            # non-usable statuses are filtered out.
            if status and status not in usable_statuses:
                continue
            interventions.append({
                "id": doc_snap.id,
                "title": data.get("title") or data.get("interventionTitle") or data.get("name") or "",
                "department": data.get("departmentName") or data.get("areaOfSupport") or data.get("department") or "",
                "description": data.get("description") or data.get("objective") or "",
                "tags": data.get("tags") or [],
                "stage": data.get("stage") or data.get("businessStage") or "",
            })
            # Stop early instead of reading the rest of the collection only
            # to slice the result afterwards.
            if len(interventions) >= limit:
                break
        return interventions
    except Exception as e:
        # Best effort: the intake analysis proceeds without the catalog.
        print("fetch_intervention_catalog_failed:", e)
        return []
+def _build_sme_intake_prompt(payload: Dict[str, Any]) -> str:
+    return f"""
+You are analysing an unregistered SME intake for an incubation/support platform.
+Return STRICT JSON only with this exact shape:
+{{
+  "detectedLanguages": ["English"],
+  "confidence": 0,
+  "extractedProfile": {{
+    "businessName": "",
+    "beneficiaryName": "",
+    "contactPerson": "",
+    "email": "",
+    "phone": "",
+    "registrationNumber": "",
+    "taxNumber": "",
+    "sector": "",
+    "province": "",
+    "city": "",
+    "yearsOperating": "",
+    "employeeCount": "",
+    "monthlyRevenue": "",
+    "mainChallenges": []
+  }},
+  "extractedApplication": {{
+    "motivation": "",
+    "challenges": "",
+    "businessDescription": "",
+    "supportNeeded": [],
+    "documentsMentioned": []
+  }},
+  "documentFindings": [
+    {{
+      "documentType": "id_document|cipc|tax_pin|other",
+      "filename": "",
+      "extractedFields": {{}},
+      "confidence": 0,
+      "warnings": []
+    }}
+  ],
+  "missingFields": [
+    {{
+      "field": "",
+      "label": "",
+      "question": "",
+      "reason": ""
+    }}
+  ],
+  "missingDocuments": [
+    {{
+      "type": "id_document|cipc|tax_pin|other",
+      "reason": ""
+    }}
+  ],
+  "nextQuestion": "",
+  "decision": {{
+    "stage": "idea|startup|early_growth|struggling|scaling|market_ready",
+    "urgencyLevel": "low|medium|high|urgent",
+    "urgencyScore": 0,
+    "riskLevel": "low|medium|high",
+    "recommendedProgramId": "",
+    "recommendedProgramName": "",
+    "recommendedInterventions": [
+      {{
+        "title": "",
+        "department": "",
+        "urgency": "low|medium|high|urgent",
+        "reason": ""
+      }}
+    ],
+    "summary": "",
+    "classificationReasons": [],
+    "redFlags": [],
+    "growthSignals": [],
+    "benefitsOfProgram": [],
+    "benefitsOfAgents": []
+  }},
+  "warnings": []
+}}
+Rules:
+- Use the SME story and uploaded document text only.
+- Do not invent registration numbers, tax numbers, IDs, turnover, employees, or compliance status.
+- If a field is missing, add it to missingFields using a friendly conversational question.
+- Required basic documents are: Director ID, CIPC document, SARS Tax PIN.
+- If a required document is not uploaded or cannot be identified, add it to missingDocuments.
+- Classify the SME stage:
+  - idea: concept only, no operating history
+  - startup: newly operating or still proving model
+  - early_growth: some traction, some customers/revenue
+  - struggling: stagnant revenue, compliance gaps, cashflow pressure, no growth, operational distress
+  - scaling: growing and needing structured support
+  - market_ready: compliant and ready for procurement/market linkage
+- Urgency must reflect intervention need, not emotional tone.
+- urgent/high urgency should be used where compliance, tax, funding, payroll, legal, safety, or survival risks are clear.
+- Recommend interventions only from the intervention catalog where possible.
+- Recommend one program from availablePrograms where possible.
+- If no program fits, leave recommendedProgramId blank and explain in summary.
+- benefitsOfProgram must explain why structured incubation helps.
+- benefitsOfAgents must explain why direct agent support helps without programme onboarding.
+- Keep text professional and concise.
+- Return JSON only.
+Payload:
+{json.dumps(payload, ensure_ascii=False)}
+""".strip()
def _sme_as_dict(value: Any) -> Dict[str, Any]:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _sme_dict_items(value: Any) -> List[Dict[str, Any]]:
    """Return the dict entries of *value* when it is a list, otherwise []."""
    if not isinstance(value, list):
        return []
    return [item for item in value if isinstance(item, dict)]


def _sme_str_list(value: Any) -> List[str]:
    """Coerce *value* into a list of non-empty, stripped strings.

    A bare string becomes a one-element list (rather than being iterated
    character by character); None and other non-list values become [].
    """
    if isinstance(value, str):
        value = [value]
    elif not isinstance(value, (list, tuple)):
        return []
    return [
        str(entry).strip()
        for entry in value
        if entry is not None and str(entry).strip()
    ]


def _normalize_sme_intake_result(raw: Dict[str, Any]) -> Dict[str, Any]:
    """Coerce the model's intake JSON into the response shape of the endpoint.

    The model output is untrusted: keys may be missing, explicitly null, or
    of the wrong type. Every field is therefore defaulted and type-checked
    instead of assumed, so malformed output degrades to empty values rather
    than raising.

    Args:
        raw: Parsed JSON object returned by the model (anything that is not
            a dict is treated as an empty result).

    Returns:
        A dict with the keys detectedLanguages, confidence, extractedProfile,
        extractedApplication, documentFindings, missingFields (max 12),
        missingDocuments (max 10), nextQuestion, decision and warnings.
        Unknown stage / urgency / risk values fall back to "startup" /
        "medium" / "medium".
    """
    raw = _sme_as_dict(raw)
    decision = _sme_as_dict(raw.get("decision"))

    allowed_stages = ("idea", "startup", "early_growth", "struggling", "scaling", "market_ready")
    allowed_urgency = ("low", "medium", "high", "urgent")
    allowed_risk = ("low", "medium", "high")

    stage = _norm(decision.get("stage"))
    urgency = _norm(decision.get("urgencyLevel"))
    risk = _norm(decision.get("riskLevel"))
    if stage not in allowed_stages:
        stage = "startup"
    if urgency not in allowed_urgency:
        urgency = "medium"
    if risk not in allowed_risk:
        risk = "medium"

    interventions = []
    for item in _sme_dict_items(decision.get("recommendedInterventions")):
        title = str(item.get("title") or "").strip()
        if not title:
            continue
        item_urgency = _norm(item.get("urgency"))
        if item_urgency not in allowed_urgency:
            # Inherit the overall urgency when the item's own is unusable.
            item_urgency = urgency
        interventions.append({
            "title": title,
            "department": str(item.get("department") or "").strip(),
            "urgency": item_urgency,
            "reason": str(item.get("reason") or "").strip(),
        })

    missing_fields = []
    for item in _sme_dict_items(raw.get("missingFields")):
        field = str(item.get("field") or "").strip()
        question = str(item.get("question") or "").strip()
        # An entry is only actionable with both a field key and a question.
        if not field or not question:
            continue
        missing_fields.append({
            "field": field,
            "label": str(item.get("label") or field).strip(),
            "question": question,
            "reason": str(item.get("reason") or "").strip(),
        })

    missing_documents = []
    for item in _sme_dict_items(raw.get("missingDocuments")):
        doc_type = str(item.get("type") or "").strip()
        if not doc_type:
            continue
        missing_documents.append({
            "type": doc_type,
            "reason": str(item.get("reason") or "").strip(),
        })

    document_findings = [
        {
            "documentType": str(item.get("documentType") or "other").strip(),
            "filename": str(item.get("filename") or "").strip(),
            "extractedFields": _sme_as_dict(item.get("extractedFields")),
            "confidence": _clamp_pct(item.get("confidence")),
            "warnings": _sme_str_list(item.get("warnings")),
        }
        for item in _sme_dict_items(raw.get("documentFindings"))
    ]

    return {
        "detectedLanguages": _sme_str_list(raw.get("detectedLanguages")),
        "confidence": _clamp_pct(raw.get("confidence")),
        "extractedProfile": _sme_as_dict(raw.get("extractedProfile")),
        "extractedApplication": _sme_as_dict(raw.get("extractedApplication")),
        "documentFindings": document_findings,
        "missingFields": missing_fields[:12],
        "missingDocuments": missing_documents[:10],
        "nextQuestion": str(raw.get("nextQuestion") or "").strip(),
        "decision": {
            "stage": stage,
            "urgencyLevel": urgency,
            "urgencyScore": _clamp_pct(decision.get("urgencyScore")),
            "riskLevel": risk,
            "recommendedProgramId": str(decision.get("recommendedProgramId") or "").strip(),
            "recommendedProgramName": str(decision.get("recommendedProgramName") or "").strip(),
            "recommendedInterventions": interventions[:10],
            "summary": str(decision.get("summary") or "").strip(),
            "classificationReasons": _sme_str_list(decision.get("classificationReasons"))[:10],
            "redFlags": _sme_str_list(decision.get("redFlags"))[:10],
            "growthSignals": _sme_str_list(decision.get("growthSignals"))[:10],
            "benefitsOfProgram": _sme_str_list(decision.get("benefitsOfProgram"))[:6],
            "benefitsOfAgents": _sme_str_list(decision.get("benefitsOfAgents"))[:6],
        },
        "warnings": _sme_str_list(raw.get("warnings")),
    }
 # -- route ---------------------------------------------------------------
 @app.route('/chat', methods=['POST'])
             "error": "Failed to generate course outline from file"
         }), 500
def _load_sme_form_json(raw_value: Any, fallback: Any) -> Any:
    """Parse a JSON form field, falling back on bad JSON or the wrong type.

    The parsed value is accepted only when it has the same type as
    *fallback* (dict for answers, list for documents); anything else,
    including an empty or missing field, yields *fallback*.
    """
    if not raw_value:
        return fallback
    try:
        parsed = json.loads(raw_value)
    except (TypeError, ValueError, RecursionError):
        return fallback
    return parsed if isinstance(parsed, type(fallback)) else fallback


def _collect_sme_document_payloads(uploaded_files: Any) -> List[Dict[str, Any]]:
    """Turn uploaded files into the per-document entries sent to the AI.

    Each entry has the keys filename, contentType, extractedText and
    warnings. Uploads without a filename are skipped; unsupported types and
    empty files are reported with a warning and no text.
    """
    image_extensions = ("png", "jpg", "jpeg")
    payloads: List[Dict[str, Any]] = []

    for uploaded in uploaded_files:
        filename = uploaded.filename or ""
        if not filename:
            continue

        entry: Dict[str, Any] = {
            "filename": filename,
            "contentType": uploaded.content_type,
            "extractedText": "",
            "warnings": [],
        }

        if not _allowed_sme_doc(filename):
            entry["warnings"] = ["Unsupported document type."]
            payloads.append(entry)
            continue

        file_bytes = uploaded.read()
        if not file_bytes:
            entry["warnings"] = ["Uploaded file was empty."]
            payloads.append(entry)
            continue

        extracted_text = _extract_sme_doc_text(filename, file_bytes)
        truncated_text = _truncate_source_text(extracted_text, MAX_SME_DOC_TEXT_CHARS)
        # Safe to index: _allowed_sme_doc guarantees the name contains a dot.
        is_image = filename.rsplit(".", 1)[1].lower() in image_extensions

        warnings = []
        if is_image:
            warnings.append("Image OCR is not enabled on this endpoint yet.")
        if extracted_text and len(truncated_text) < len(extracted_text):
            warnings.append("Document text was truncated before AI analysis.")
        if not extracted_text and not is_image:
            warnings.append("No readable text could be extracted from this document.")

        entry["extractedText"] = truncated_text
        entry["warnings"] = warnings
        payloads.append(entry)

    return payloads


@app.route('/analyze-sme-application-intake', methods=['POST'])
def analyze_sme_application_intake():
    """
    Multipart form-data endpoint for unregistered SME intake.
    Expected form fields:
      - mode: initial_review | final_decision (defaults to initial_review)
      - companyCode (required)
      - userId optional
      - contactName
      - contactEmail (the field name "email" is also accepted)
      - contactPhone
      - rawStory (required)
      - missingAnswersJson optional, JSON object
      - requiredDocumentsJson optional, JSON array
      - files or files[] optional
    Response:
      {
        detectedLanguages,
        confidence,
        extractedProfile,
        extractedApplication,
        documentFindings,
        missingFields,
        missingDocuments,
        nextQuestion,
        decision,
        warnings
      }
    Errors:
      - 400 when companyCode or rawStory is missing
      - 500 when anything else fails (the error is printed)
    """
    try:
        form = request.form
        mode = form.get("mode") or "initial_review"
        company_code = form.get("companyCode")
        user_id = form.get("userId") or ""
        contact_name = form.get("contactName") or ""
        contact_email = form.get("email") or form.get("contactEmail") or ""
        contact_phone = form.get("contactPhone") or ""
        raw_story = (form.get("rawStory") or "").strip()

        if not company_code:
            return jsonify({"error": "Missing companyCode"}), 400
        if not raw_story:
            return jsonify({"error": "Missing rawStory"}), 400

        # Malformed or wrongly-typed JSON degrades to an empty value instead
        # of failing the request.
        missing_answers = _load_sme_form_json(form.get("missingAnswersJson"), {})
        required_documents = _load_sme_form_json(form.get("requiredDocumentsJson"), [])

        # Accept both spellings: the documented "files[]" and plain "files".
        uploaded_files = request.files.getlist("files") + request.files.getlist("files[]")
        document_payloads = _collect_sme_document_payloads(uploaded_files)

        available_programs = _fetch_active_programs(company_code)
        intervention_catalog = _fetch_intervention_catalog(company_code)

        payload = {
            "mode": mode,
            "companyCode": company_code,
            "userId": user_id,
            "contact": {
                "name": contact_name,
                "email": contact_email,
                "phone": contact_phone,
            },
            "rawStory": raw_story,
            "missingAnswers": missing_answers,
            "requiredDocuments": required_documents,
            "uploadedDocuments": document_payloads,
            "availablePrograms": available_programs,
            "interventionCatalog": intervention_catalog,
            # NOTE(review): utcnow() is deprecated since Python 3.12; moving
            # to an aware datetime would change the string sent to the model.
            "analysisDate": datetime.utcnow().isoformat(),
        }

        system_msg = {
            "role": "system",
            "content": (
                "You analyse SME intake information for an incubation platform. "
                "You return strict JSON only. "
                "You must not invent official facts, registration numbers, tax numbers, "
                "document statuses, revenue, employees, or compliance claims."
            )
        }
        user_msg = {
            "role": "user",
            "content": _build_sme_intake_prompt(payload)
        }

        ai_raw = ask_gpt([system_msg, user_msg])
        ai_result = _extract_json_block(ai_raw)
        normalized = _normalize_sme_intake_result(ai_result)
        return jsonify(to_jsonable(normalized))
    except Exception as e:
        # Top-level boundary: log and return a generic error to the client.
        print("analyze_sme_application_intake_failed:", e)
        return jsonify({
            "error": "Failed to analyse SME application intake"
        }), 500
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860)