File size: 7,779 Bytes
18b8b90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""
Document Analyzer — Sends OCR-extracted JSON to Groq LLM for intelligent analysis.
Produces: summary, risk_level, severity_score, key_findings, recommendation,
confidence, tampering_detected.
Uses the existing GROQ_API_KEY — zero additional cost.
"""

import json
import logging
from app.core.config import settings

logger = logging.getLogger(__name__)

# --- Arabic prompt template (parameterized by document type) -----------------

# Prompt sent to the LLM as the single user message. It is rendered with
# ``str.format`` and takes two placeholders:
#   {doc_type}    - the document type label passed to ``analyze``
#   {fields_json} - the OCR fields serialized as pretty-printed JSON
# The doubled braces ``{{`` / ``}}`` are escaped literal braces, so the JSON
# skeleton survives formatting. The skeleton lists the keys the analyzer
# expects back: summary, risk_level, severity_score, key_findings,
# recommendation, confidence, tampering_detected.
# NOTE(review): the non-ASCII text below looks mis-decoded (mojibake) in this
# copy of the file - confirm the file on disk is stored and read as UTF-8.
_BASE_PROMPT = """\
ุฃู†ุช ู…ุญู„ู„ ูˆุซุงุฆู‚ ู…ุชุฎุตุต ููŠ ู…ู†ุตุฉ ุนูˆู† ู„ู„ู…ุณุงุนุฏุฉ ุงู„ุงุฌุชู…ุงุนูŠุฉ.
ุชู… ุงุณุชุฎุฑุงุฌ ุงู„ุจูŠุงู†ุงุช ุงู„ุชุงู„ูŠุฉ ู…ู† ูˆุซูŠู‚ุฉ ู…ู† ู†ูˆุน "{doc_type}" ุจุงุณุชุฎุฏุงู… OCR:

{fields_json}

ู‚ู… ุจุชุญู„ูŠู„ ู‡ุฐู‡ ุงู„ุจูŠุงู†ุงุช ูˆุฃุนุฏ ุฑุฏู‹ุง ุจุชู†ุณูŠู‚ JSON ูู‚ุท โ€” ุจุฏูˆู† ุฃูŠ ุดุฑุญ ุฃูˆ ู†ุต ุฎุงุฑุฌ JSON.

ุงู„ุญู‚ูˆู„ ุงู„ู…ุทู„ูˆุจุฉ:
{{
  "summary": "ู…ู„ุฎุต ู…ูˆุฌุฒ ู„ู„ูˆุซูŠู‚ุฉ ุจุงู„ุนุฑุจูŠุฉ (ุฌู…ู„ุฉ ุฃูˆ ุฌู…ู„ุชุงู†)",
  "risk_level": "high | medium | low",
  "severity_score": <ุฑู‚ู… ู…ู† 0 ุฅู„ู‰ 100>,
  "key_findings": ["ุงูƒุชุดุงู 1", "ุงูƒุชุดุงู 2", ...],
  "recommendation": "ุชูˆุตูŠุฉ ูˆุงุญุฏุฉ ูˆุงุถุญุฉ ู„ู„ู…ูˆุธู ุงู„ู…ุณุคูˆู„",
  "confidence": <ุฑู‚ู… ู…ู† 0.0 ุฅู„ู‰ 1.0 ูŠุนุจุฑ ุนู† ู…ุฏู‰ ุงูƒุชู…ุงู„ ุงู„ุจูŠุงู†ุงุช>,
  "tampering_detected": true | false
}}

ู‚ูˆุงุนุฏ ุงู„ุชู‚ูŠูŠู…:
- risk_level=high   โ†’ ู…ุฑุถ ู…ุฒู…ู† / ุฏุฎู„ ุฃู‚ู„ ู…ู† 1500 ุฌู†ูŠู‡ / ุฏูŠูˆู† ุชุชุฌุงูˆุฒ ุถุนู ุงู„ุฏุฎู„
- risk_level=medium โ†’ ูˆุถุน ู…ุชูˆุณุท ูŠุญุชุงุฌ ู…ุชุงุจุนุฉ
- risk_level=low    โ†’ ูˆุถุน ู…ุณุชู‚ุฑ ู„ุง ูŠุณุชุฏุนูŠ ุชุฏุฎู„ุงู‹ ุนุงุฌู„ุงู‹
- ู„ูˆ ููŠ ุญู‚ูˆู„ ู†ุงู‚ุตุฉ (null) โ†’ ุงุฐูƒุฑู‡ุง ููŠ key_findings ูˆุฎูุถ confidence
- ู„ู„ูƒุดู ุนู† ุงู„ุชุฒูˆูŠุฑ (tampering_detected=true): ู‡ู„ ู‡ู†ุงูƒ ุชู†ุงู‚ุถ ุตุฑูŠุญ ููŠ ุงู„ุชูˆุงุฑูŠุฎุŸ ู‡ู„ ุงู„ุฃุฑู‚ุงู… ุฃูˆ ุงู„ู†ุตูˆุต ุชุจุฏูˆ ุบูŠุฑ ู…ู†ุทู‚ูŠุฉ ุฃูˆ ู…ุชู‚ุทุนุฉ ุจุดูƒู„ ูŠุฏู„ ุนู„ู‰ ุชู„ุงุนุจุŸ
- ุฃุฌุจ ุจู€ JSON ูู‚ุท ุจุฏูˆู† markdown
"""

# ─── Analyzer ────────────────────────────────────────────────────────────────

class DocumentAnalyzer:
    """LLM-powered document intelligence layer, built on top of OCR output.

    Sends OCR-extracted fields to a Groq-hosted chat model and returns a
    normalized analysis dict with the keys ``summary``, ``risk_level``,
    ``severity_score``, ``key_findings``, ``recommendation``, ``confidence``
    and ``tampering_detected``. LLM, serialization and parsing failures are
    mapped to fallback dicts that carry the same keys.
    """

    # Request settings used for every analysis call.
    _MODEL = "llama-3.3-70b-versatile"
    _TEMPERATURE = 0.1  # low temperature keeps the structured output stable
    _MAX_TOKENS = 512

    # Closed set of accepted risk labels; anything else becomes "medium".
    _RISK_LEVELS = ("high", "medium", "low")

    def __init__(self) -> None:
        """Create the Groq client, or mark the analyzer unavailable.

        When ``settings.GROQ_API_KEY`` is falsy a warning is logged and
        ``self.client`` is set to ``None``; no exception is raised.
        """
        if not settings.GROQ_API_KEY:
            logger.warning("GROQ_API_KEY missing โ€” Document Analyzer unavailable.")
            self.client = None
        else:
            # Imported lazily so the module can be imported without the SDK
            # being needed when no key is configured.
            from groq import AsyncGroq
            self.client = AsyncGroq(api_key=settings.GROQ_API_KEY, timeout=30.0)
            logger.info("Document Analyzer initialized (Groq).")

    def is_available(self) -> bool:
        """Return True when a Groq client was created in ``__init__``."""
        return self.client is not None

    async def analyze(self, ocr_fields: dict, document_type: str) -> dict:
        """
        Analyze OCR-extracted fields using the LLM.

        Args:
            ocr_fields: The dict returned by OCR. Keys starting with "_"
                (internal metadata) are dropped before prompting. ``None``
                is treated as an empty dict.
            document_type: e.g. "medical_report", "income_proof".

        Returns:
            A dict with the keys summary, risk_level, severity_score,
            key_findings, recommendation, confidence and tampering_detected.
            When the client is missing, the reply is not a JSON object, or
            the request fails, a fallback dict with the same keys is
            returned instead of raising.
        """
        if not self.client:
            return self._unavailable_fallback(document_type)

        # Remove internal keys before sending to the LLM. Non-string keys
        # cannot be "internal", so they are kept rather than crashing on
        # ``startswith``.
        clean_fields = {
            key: value
            for key, value in (ocr_fields or {}).items()
            if not (isinstance(key, str) and key.startswith("_"))
        }

        # Share of non-null fields: used as the confidence when the model
        # does not supply a usable one.
        null_count = sum(1 for value in clean_fields.values() if value is None)
        total_fields = len(clean_fields) or 1
        default_confidence = round(1 - (null_count / total_fields), 2)

        # Bound before the try so the JSONDecodeError handler can always log
        # it, even if the error is raised before the reply text is read.
        raw = ""
        try:
            # Built inside the try so a serialization failure also yields a
            # fallback. ``default=str`` is deliberate: the JSON is only
            # prompt text, so stringifying odd values (dates, Decimals) is
            # preferable to failing the whole analysis.
            prompt = _BASE_PROMPT.format(
                doc_type=document_type,
                fields_json=json.dumps(
                    clean_fields, ensure_ascii=False, indent=2, default=str
                ),
            )
            response = await self.client.chat.completions.create(
                model=self._MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=self._TEMPERATURE,
                max_tokens=self._MAX_TOKENS,
            )
            # ``content`` may be None; treat that as an empty (invalid) reply.
            raw = (response.choices[0].message.content or "").strip()
            raw = self._strip_code_fences(raw)

            result = json.loads(raw)
            if not isinstance(result, dict):
                # Valid JSON but not an object (e.g. a list or a string).
                logger.error(
                    "LLM returned JSON that is not an object | raw=%s", raw[:200]
                )
                return self._parse_error_fallback(document_type)

            return self._normalize_result(result, default_confidence)

        except json.JSONDecodeError as e:
            logger.error("LLM returned non-JSON response: %s | raw=%s", e, raw[:200])
            return self._parse_error_fallback(document_type)
        except Exception as e:
            # Top-level boundary: any SDK/network/serialization failure
            # degrades to a fallback instead of propagating to the caller.
            logger.error("Document analysis failed: %s", e, exc_info=True)
            return self._error_fallback(document_type)

    # --- Response parsing helpers --------------------------------------------

    @staticmethod
    def _strip_code_fences(raw: str) -> str:
        """Return the text inside the first markdown code fence, if any."""
        for fence in ("```json", "```"):
            if fence in raw:
                return raw.split(fence, 1)[1].split("```", 1)[0].strip()
        return raw

    @staticmethod
    def _to_float(value):
        """Return *value* as a float, or None when it is not a usable number."""
        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError):
            return None
        # NaN is the only float that is not equal to itself.
        return None if number != number else number

    @staticmethod
    def _as_bool(value) -> bool:
        """Interpret *value* as a flag; strings like "false" are not truthy."""
        if isinstance(value, str):
            return value.strip().lower() in ("true", "yes", "1")
        return bool(value)

    @classmethod
    def _normalize_result(cls, result: dict, default_confidence: float) -> dict:
        """Fill defaults and coerce the LLM reply into the expected shape.

        Mutates and returns *result*; keys not listed below are left as-is.
        Missing, null or wrongly-typed values are replaced by defaults
        rather than failing the whole analysis.
        """
        text_defaults = (
            ("summary", "ุชุนุฐู‘ุฑ ุฅู†ุดุงุก ุงู„ู…ู„ุฎุต."),
            ("recommendation", "ูŠูุฑุฌู‰ ู…ุฑุงุฌุนุฉ ุงู„ูˆุซูŠู‚ุฉ ูŠุฏูˆูŠุงู‹."),
        )
        for key, default in text_defaults:
            if not isinstance(result.get(key), str):
                result[key] = default

        findings = result.get("key_findings")
        if findings is None:
            findings = []
        elif not isinstance(findings, list):
            findings = [findings]
        result["key_findings"] = findings

        # Clamp before int() so infinities cannot raise OverflowError.
        score = cls._to_float(result.get("severity_score"))
        result["severity_score"] = (
            50 if score is None else int(max(0.0, min(100.0, score)))
        )

        risk = result.get("risk_level")
        risk = risk.strip().lower() if isinstance(risk, str) else ""
        result["risk_level"] = risk if risk in cls._RISK_LEVELS else "medium"

        confidence = cls._to_float(result.get("confidence"))
        result["confidence"] = (
            default_confidence
            if confidence is None
            else max(0.0, min(1.0, confidence))
        )

        # If tampering is detected, prepend a warning to the recommendation.
        tampered = cls._as_bool(result.get("tampering_detected"))
        result["tampering_detected"] = tampered
        if tampered:
            result["recommendation"] = (
                "๐Ÿšจ ุชุญุฐูŠุฑ: ุงุดุชุจุงู‡ ููŠ ุชู„ุงุนุจ ุจุงู„ูˆุซูŠู‚ุฉ. " + result["recommendation"]
            )

        return result

    # --- Fallbacks -----------------------------------------------------------

    @staticmethod
    def _fallback(summary: str, finding: str, recommendation: str) -> dict:
        """Build a neutral (medium risk, zero confidence) analysis dict."""
        return {
            "summary": summary,
            "risk_level": "medium",
            "severity_score": 50,
            "key_findings": [finding],
            "recommendation": recommendation,
            "confidence": 0.0,
            "tampering_detected": False,
        }

    @staticmethod
    def _unavailable_fallback(doc_type: str) -> dict:
        """Fallback used when no Groq client exists. *doc_type* is unused."""
        return DocumentAnalyzer._fallback(
            "ุฎุฏู…ุฉ ุงู„ุชุญู„ูŠู„ ุบูŠุฑ ู…ุชุงุญุฉ ุญุงู„ูŠุงู‹.",
            "ู„ู… ูŠุชู… ุฅุฌุฑุงุก ุงู„ุชุญู„ูŠู„ โ€” GROQ_API_KEY ุบูŠุฑ ู…ุถุจูˆุท.",
            "ูŠูุฑุฌู‰ ุถุจุท GROQ_API_KEY ูˆุฅุนุงุฏุฉ ุงู„ู…ุญุงูˆู„ุฉ.",
        )

    @staticmethod
    def _parse_error_fallback(doc_type: str) -> dict:
        """Fallback used when the reply is not a JSON object. *doc_type* is unused."""
        return DocumentAnalyzer._fallback(
            "ุชุนุฐู‘ุฑ ุชุญู„ูŠู„ ุงู„ูˆุซูŠู‚ุฉ โ€” ุงู„ุฑุฏ ุบูŠุฑ ุตุงู„ุญ.",
            "ูุดู„ ุชุญู„ูŠู„ ุฑุฏ ุงู„ู†ู…ูˆุฐุฌ โ€” ูŠูุฑุฌู‰ ุงู„ู…ุฑุงุฌุนุฉ ุงู„ูŠุฏูˆูŠุฉ.",
            "ุฑุงุฌุน ุงู„ุจูŠุงู†ุงุช ุงู„ู…ุณุชุฎุฑุฌุฉ ูŠุฏูˆูŠุงู‹.",
        )

    @staticmethod
    def _error_fallback(doc_type: str) -> dict:
        """Fallback used when the request itself fails. *doc_type* is unused."""
        return DocumentAnalyzer._fallback(
            "ุญุฏุซ ุฎุทุฃ ุฃุซู†ุงุก ุงู„ุชุญู„ูŠู„.",
            "ุชุนุฐู‘ุฑ ุงู„ุชุญู„ูŠู„ ุจุณุจุจ ุฎุทุฃ ุชู‚ู†ูŠ.",
            "ูŠูุฑุฌู‰ ุงู„ู…ุญุงูˆู„ุฉ ู…ุฌุฏุฏุงู‹ ุฃูˆ ุงู„ู…ุฑุงุฌุนุฉ ุงู„ูŠุฏูˆูŠุฉ.",
        )


# Module-level singleton, created once at import time. A missing GROQ_API_KEY
# does not raise here: the constructor only logs a warning and leaves the
# analyzer unavailable (check ``document_analyzer.is_available()``).
document_analyzer = DocumentAnalyzer()