"""
Document Analyzer — sends OCR-extracted JSON to Groq LLM for intelligent analysis.
Produces: summary, risk_level, severity_score, key_findings, recommendation.
Uses the existing GROQ_API_KEY — zero additional cost.
"""
import json
import logging
from app.core.config import settings
logger = logging.getLogger(__name__)
# โโโ Arabic Prompt Templates per document type โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
_BASE_PROMPT = """\
ุฃูุช ู
ุญูู ูุซุงุฆู ู
ุชุฎุตุต ูู ู
ูุตุฉ ุนูู ููู
ุณุงุนุฏุฉ ุงูุงุฌุชู
ุงุนูุฉ.
ุชู
ุงุณุชุฎุฑุงุฌ ุงูุจูุงูุงุช ุงูุชุงููุฉ ู
ู ูุซููุฉ ู
ู ููุน "{doc_type}" ุจุงุณุชุฎุฏุงู
OCR:
{fields_json}
ูู
ุจุชุญููู ูุฐู ุงูุจูุงูุงุช ูุฃุนุฏ ุฑุฏูุง ุจุชูุณูู JSON ููุท โ ุจุฏูู ุฃู ุดุฑุญ ุฃู ูุต ุฎุงุฑุฌ JSON.
ุงูุญููู ุงูู
ุทููุจุฉ:
{{
"summary": "ู
ูุฎุต ู
ูุฌุฒ ูููุซููุฉ ุจุงูุนุฑุจูุฉ (ุฌู
ูุฉ ุฃู ุฌู
ูุชุงู)",
"risk_level": "high | medium | low",
"severity_score": <ุฑูู
ู
ู 0 ุฅูู 100>,
"key_findings": ["ุงูุชุดุงู 1", "ุงูุชุดุงู 2", ...],
"recommendation": "ุชูุตูุฉ ูุงุญุฏุฉ ูุงุถุญุฉ ููู
ูุธู ุงูู
ุณุคูู",
"confidence": <ุฑูู
ู
ู 0.0 ุฅูู 1.0 ูุนุจุฑ ุนู ู
ุฏู ุงูุชู
ุงู ุงูุจูุงูุงุช>,
"tampering_detected": true | false
}}
ููุงุนุฏ ุงูุชูููู
:
- risk_level=high โ ู
ุฑุถ ู
ุฒู
ู / ุฏุฎู ุฃูู ู
ู 1500 ุฌููู / ุฏููู ุชุชุฌุงูุฒ ุถุนู ุงูุฏุฎู
- risk_level=medium โ ูุถุน ู
ุชูุณุท ูุญุชุงุฌ ู
ุชุงุจุนุฉ
- risk_level=low โ ูุถุน ู
ุณุชูุฑ ูุง ูุณุชุฏุนู ุชุฏุฎูุงู ุนุงุฌูุงู
- ูู ูู ุญููู ูุงูุตุฉ (null) โ ุงุฐูุฑูุง ูู key_findings ูุฎูุถ confidence
- ูููุดู ุนู ุงูุชุฒููุฑ (tampering_detected=true): ูู ููุงู ุชูุงูุถ ุตุฑูุญ ูู ุงูุชูุงุฑูุฎุ ูู ุงูุฃุฑูุงู
ุฃู ุงููุตูุต ุชุจุฏู ุบูุฑ ู
ูุทููุฉ ุฃู ู
ุชูุทุนุฉ ุจุดูู ูุฏู ุนูู ุชูุงุนุจุ
- ุฃุฌุจ ุจู JSON ููุท ุจุฏูู markdown
"""
# ─── Analyzer ────────────────────────────────────────────────────────────────
class DocumentAnalyzer:
    """LLM-powered document intelligence layer, built on top of OCR output.

    Sends OCR-extracted fields to the Groq chat API with an Arabic analysis
    prompt and normalizes the model's JSON reply into the DocumentAnalysis
    schema.  Every failure mode degrades to a safe fallback dict rather than
    raising to the caller.

    NOTE(review): the Arabic string literals below appear mojibake-garbled
    in this view and are reproduced as-is (wrapped fragments rejoined) —
    verify them against the file's true encoding before editing.
    """

    def __init__(self):
        # groq is imported lazily so the module stays importable in
        # environments where the key (and possibly the package) is absent;
        # self.client is None in that case and analyze() short-circuits.
        if not settings.GROQ_API_KEY:
            logger.warning("GROQ_API_KEY missing โ Document Analyzer unavailable.")
            self.client = None
        else:
            from groq import AsyncGroq
            self.client = AsyncGroq(api_key=settings.GROQ_API_KEY, timeout=30.0)
            logger.info("Document Analyzer initialized (Groq).")

    def is_available(self) -> bool:
        """Return True when a Groq client was successfully configured."""
        return self.client is not None

    async def analyze(self, ocr_fields: dict, document_type: str) -> dict:
        """
        Analyze OCR-extracted fields using the LLM.

        Args:
            ocr_fields: The dict returned by OCR (internal ``_``-prefixed
                keys are stripped before sending).
            document_type: e.g. "medical_report", "income_proof".

        Returns:
            dict matching the DocumentAnalysis schema, or a graceful
            fallback when the service is unavailable or the reply is bad.
        """
        if not self.client:
            return self._unavailable_fallback(document_type)

        # Remove internal keys before sending to the LLM.
        clean_fields = {k: v for k, v in ocr_fields.items() if not k.startswith("_")}

        # Count null fields for a quick completeness hint — used as the
        # confidence default when the model omits one.
        null_count = sum(1 for v in clean_fields.values() if v is None)
        total_fields = len(clean_fields) or 1  # guard division by zero on empty input

        prompt = _BASE_PROMPT.format(
            doc_type=document_type,
            fields_json=json.dumps(clean_fields, ensure_ascii=False, indent=2),
        )

        # FIX: pre-bind raw so the JSONDecodeError log below can never hit an
        # unbound local (previously safe only by accident of statement order).
        raw = ""
        try:
            response = await self.client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,  # low temp -> deterministic structured output
                max_tokens=512,
            )
            raw = response.choices[0].message.content.strip()

            # Strip markdown fences in case the model wrapped its JSON anyway.
            if "```json" in raw:
                raw = raw.split("```json")[1].split("```")[0].strip()
            elif "```" in raw:
                raw = raw.split("```")[1].split("```")[0].strip()

            result = json.loads(raw)

            # Validate required keys — fill missing ones with safe defaults.
            result.setdefault("summary", "ุชุนุฐูุฑ ุฅูุดุงุก ุงูููุฎุต.")
            result.setdefault("risk_level", "medium")
            result.setdefault("severity_score", 50)
            result.setdefault("key_findings", [])
            result.setdefault("recommendation", "ููุฑุฌู ูุฑุงุฌุนุฉ ุงููุซููุฉ ูุฏููุงู.")
            result.setdefault("confidence", round(1 - (null_count / total_fields), 2))
            result.setdefault("tampering_detected", False)

            # If tampering is detected, prepend a warning to the recommendation.
            if result.get("tampering_detected"):
                result["recommendation"] = "๐จ ุชุญุฐูุฑ: ุงุดุชุจุงู ูู ุชูุงุนุจ ุจุงููุซููุฉ. " + result["recommendation"]

            # FIX: the coercions below used to raise (ValueError/TypeError on a
            # non-numeric severity_score, AttributeError on a non-string
            # risk_level) and fall into the generic except, discarding an
            # otherwise-usable result.  Normalize each field defensively; also
            # clamp the model-supplied confidence into [0.0, 1.0].
            try:
                result["severity_score"] = max(0, min(100, int(result["severity_score"])))
            except (TypeError, ValueError):
                result["severity_score"] = 50
            risk = result["risk_level"]
            result["risk_level"] = risk.lower() if isinstance(risk, str) else "medium"
            if result["risk_level"] not in ("high", "medium", "low"):
                result["risk_level"] = "medium"
            try:
                result["confidence"] = max(0.0, min(1.0, float(result["confidence"])))
            except (TypeError, ValueError):
                result["confidence"] = round(1 - (null_count / total_fields), 2)
            return result

        except json.JSONDecodeError as e:
            logger.error("LLM returned non-JSON response: %s | raw=%s", e, raw[:200])
            return self._parse_error_fallback(document_type)
        except Exception as e:
            logger.error("Document analysis failed: %s", e, exc_info=True)
            return self._error_fallback(document_type)

    # ─── Fallbacks ─────────────────────────────────────────────────────────
    # Each fallback returns a complete DocumentAnalysis-shaped dict with
    # neutral risk (medium / 50) and zero confidence; doc_type is accepted
    # for signature symmetry even where unused.

    @staticmethod
    def _unavailable_fallback(doc_type: str) -> dict:
        """Fallback when no Groq client is configured (missing API key)."""
        return {
            "summary": "ุฎุฏูุฉ ุงูุชุญููู ุบูุฑ ูุชุงุญุฉ ุญุงููุงู.",
            "risk_level": "medium",
            "severity_score": 50,
            "key_findings": ["ูู ูุชู ุฅุฌุฑุงุก ุงูุชุญููู โ GROQ_API_KEY ุบูุฑ ูุถุจูุท."],
            "recommendation": "ููุฑุฌู ุถุจุท GROQ_API_KEY ูุฅุนุงุฏุฉ ุงููุญุงููุฉ.",
            "confidence": 0.0,
            "tampering_detected": False,
        }

    @staticmethod
    def _parse_error_fallback(doc_type: str) -> dict:
        """Fallback when the LLM reply could not be parsed as JSON."""
        return {
            "summary": "ุชุนุฐูุฑ ุชุญููู ุงููุซููุฉ โ ุงูุฑุฏ ุบูุฑ ุตุงูุญ.",
            "risk_level": "medium",
            "severity_score": 50,
            "key_findings": ["ูุดู ุชุญููู ุฑุฏ ุงูููููุฐุฌ โ ููุฑุฌู ุงูููุฑุงุฌุนุฉ ุงููุฏููุฉ."],
            "recommendation": "ุฑุงุฌุน ุงูุจูุงูุงุช ุงูููุณุชุฎุฑุฌุฉ ูุฏููุงู.",
            "confidence": 0.0,
            "tampering_detected": False,
        }

    @staticmethod
    def _error_fallback(doc_type: str) -> dict:
        """Fallback for any other failure (network, API, unexpected error)."""
        return {
            "summary": "ุญุฏุซ ุฎุทุฃ ุฃุซูุงุก ุงูุชุญููู.",
            "risk_level": "medium",
            "severity_score": 50,
            "key_findings": ["ุชุนุฐูุฑ ุงูุชุญููู ุจุณุจุจ ุฎุทุฃ ุชููู."],
            "recommendation": "ููุฑุฌู ุงูููุญุงููุฉ ููุฌุฏุฏุงู ุฃู ุงูููุฑุงุฌุนุฉ ุงููุฏููุฉ.",
            "confidence": 0.0,
            "tampering_detected": False,
        }
# Module-level singleton — import this instance rather than constructing
# DocumentAnalyzer directly; the constructor logs availability once at
# import time based on settings.GROQ_API_KEY.
document_analyzer = DocumentAnalyzer()