# Source: Aoun-AI — app/services/document/document_analyzer.py
# Author: MuhammadMahmoud
# Commit 18b8b90 — "feat: clean deployment with bug fixes and stability improvements"
"""
Document Analyzer โ€” Sends OCR-extracted JSON to Groq LLM for intelligent analysis.
Produces: summary, risk_level, severity_score, key_findings, recommendation.
Uses the existing GROQ_API_KEY โ€” zero additional cost.
"""
import json
import logging
from app.core.config import settings
logger = logging.getLogger(__name__)
# --- Arabic prompt template (one template shared by all document types) ------
# Formatted with .format(doc_type=..., fields_json=...); literal braces in the
# expected-JSON example are escaped as {{ }}.  The template instructs the model
# to return ONLY a JSON object with: summary, risk_level (high|medium|low),
# severity_score (0-100), key_findings, recommendation, confidence (0.0-1.0),
# tampering_detected (bool).
# NOTE(review): the text below appears mojibake'd in this copy of the file; it
# is a runtime string sent to the LLM and is kept byte-identical here.
_BASE_PROMPT = """\
ุฃู†ุช ู…ุญู„ู„ ูˆุซุงุฆู‚ ู…ุชุฎุตุต ููŠ ู…ู†ุตุฉ ุนูˆู† ู„ู„ู…ุณุงุนุฏุฉ ุงู„ุงุฌุชู…ุงุนูŠุฉ.
ุชู… ุงุณุชุฎุฑุงุฌ ุงู„ุจูŠุงู†ุงุช ุงู„ุชุงู„ูŠุฉ ู…ู† ูˆุซูŠู‚ุฉ ู…ู† ู†ูˆุน "{doc_type}" ุจุงุณุชุฎุฏุงู… OCR:
{fields_json}
ู‚ู… ุจุชุญู„ูŠู„ ู‡ุฐู‡ ุงู„ุจูŠุงู†ุงุช ูˆุฃุนุฏ ุฑุฏู‹ุง ุจุชู†ุณูŠู‚ JSON ูู‚ุท โ€” ุจุฏูˆู† ุฃูŠ ุดุฑุญ ุฃูˆ ู†ุต ุฎุงุฑุฌ JSON.
ุงู„ุญู‚ูˆู„ ุงู„ู…ุทู„ูˆุจุฉ:
{{
"summary": "ู…ู„ุฎุต ู…ูˆุฌุฒ ู„ู„ูˆุซูŠู‚ุฉ ุจุงู„ุนุฑุจูŠุฉ (ุฌู…ู„ุฉ ุฃูˆ ุฌู…ู„ุชุงู†)",
"risk_level": "high | medium | low",
"severity_score": <ุฑู‚ู… ู…ู† 0 ุฅู„ู‰ 100>,
"key_findings": ["ุงูƒุชุดุงู 1", "ุงูƒุชุดุงู 2", ...],
"recommendation": "ุชูˆุตูŠุฉ ูˆุงุญุฏุฉ ูˆุงุถุญุฉ ู„ู„ู…ูˆุธู ุงู„ู…ุณุคูˆู„",
"confidence": <ุฑู‚ู… ู…ู† 0.0 ุฅู„ู‰ 1.0 ูŠุนุจุฑ ุนู† ู…ุฏู‰ ุงูƒุชู…ุงู„ ุงู„ุจูŠุงู†ุงุช>,
"tampering_detected": true | false
}}
ู‚ูˆุงุนุฏ ุงู„ุชู‚ูŠูŠู…:
- risk_level=high โ†’ ู…ุฑุถ ู…ุฒู…ู† / ุฏุฎู„ ุฃู‚ู„ ู…ู† 1500 ุฌู†ูŠู‡ / ุฏูŠูˆู† ุชุชุฌุงูˆุฒ ุถุนู ุงู„ุฏุฎู„
- risk_level=medium โ†’ ูˆุถุน ู…ุชูˆุณุท ูŠุญุชุงุฌ ู…ุชุงุจุนุฉ
- risk_level=low โ†’ ูˆุถุน ู…ุณุชู‚ุฑ ู„ุง ูŠุณุชุฏุนูŠ ุชุฏุฎู„ุงู‹ ุนุงุฌู„ุงู‹
- ู„ูˆ ููŠ ุญู‚ูˆู„ ู†ุงู‚ุตุฉ (null) โ†’ ุงุฐูƒุฑู‡ุง ููŠ key_findings ูˆุฎูุถ confidence
- ู„ู„ูƒุดู ุนู† ุงู„ุชุฒูˆูŠุฑ (tampering_detected=true): ู‡ู„ ู‡ู†ุงูƒ ุชู†ุงู‚ุถ ุตุฑูŠุญ ููŠ ุงู„ุชูˆุงุฑูŠุฎุŸ ู‡ู„ ุงู„ุฃุฑู‚ุงู… ุฃูˆ ุงู„ู†ุตูˆุต ุชุจุฏูˆ ุบูŠุฑ ู…ู†ุทู‚ูŠุฉ ุฃูˆ ู…ุชู‚ุทุนุฉ ุจุดูƒู„ ูŠุฏู„ ุนู„ู‰ ุชู„ุงุนุจุŸ
- ุฃุฌุจ ุจู€ JSON ูู‚ุท ุจุฏูˆู† markdown
"""
# โ”€โ”€โ”€ Analyzer โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
class DocumentAnalyzer:
    """LLM-powered document intelligence layer, built on top of OCR output.

    Sends OCR-extracted fields to a Groq-hosted LLM and returns a structured
    analysis dict: summary, risk_level, severity_score, key_findings,
    recommendation, confidence, tampering_detected.  Degrades gracefully to
    fixed fallback dicts when the client is unconfigured, the LLM reply is
    not valid JSON, or any other error occurs.
    """

    def __init__(self):
        if not settings.GROQ_API_KEY:
            logger.warning("GROQ_API_KEY missing โ€” Document Analyzer unavailable.")
            self.client = None
        else:
            # Lazy import: keeps the module importable when `groq` is absent
            # and no key is configured.
            from groq import AsyncGroq
            self.client = AsyncGroq(api_key=settings.GROQ_API_KEY, timeout=30.0)
            logger.info("Document Analyzer initialized (Groq).")

    def is_available(self) -> bool:
        """Return True when a Groq client was successfully created."""
        return self.client is not None

    async def analyze(self, ocr_fields: dict, document_type: str) -> dict:
        """
        Analyze OCR-extracted fields using LLM.
        Args:
            ocr_fields: The dict returned by OCR (without _provider key).
            document_type: e.g. "medical_report", "income_proof".
        Returns:
            dict matching DocumentAnalysis schema, or a graceful fallback.
        """
        if not self.client:
            return self._unavailable_fallback(document_type)
        # Remove internal keys (underscore-prefixed) before sending to LLM.
        clean_fields = {k: v for k, v in ocr_fields.items() if not k.startswith("_")}
        # Completeness hint: share of null fields drives the default confidence.
        null_count = sum(1 for v in clean_fields.values() if v is None)
        total_fields = len(clean_fields) or 1  # avoid division by zero
        prompt = _BASE_PROMPT.format(
            doc_type=document_type,
            fields_json=json.dumps(clean_fields, ensure_ascii=False, indent=2),
        )
        raw = ""
        try:
            response = await self.client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,  # Low temp โ†’ deterministic structured output
                max_tokens=512,
            )
            # content may be None (e.g. refusal); treat as empty โ†’ parse error.
            raw = (response.choices[0].message.content or "").strip()
            result = json.loads(self._strip_fences(raw))
            if not isinstance(result, dict):
                # A bare JSON array/scalar is as useless as invalid JSON.
                logger.error("LLM returned non-JSON response: %s | raw=%s",
                             "not a JSON object", raw[:200])
                return self._parse_error_fallback(document_type)
            return self._normalize_result(
                result,
                default_confidence=round(1 - (null_count / total_fields), 2),
            )
        except json.JSONDecodeError as e:
            logger.error("LLM returned non-JSON response: %s | raw=%s", e, raw[:200])
            return self._parse_error_fallback(document_type)
        except Exception as e:
            logger.error("Document analysis failed: %s", e, exc_info=True)
            return self._error_fallback(document_type)

    @staticmethod
    def _strip_fences(raw: str) -> str:
        """Remove a surrounding markdown code fence, if the model added one."""
        if "```json" in raw:
            return raw.split("```json")[1].split("```")[0].strip()
        if "```" in raw:
            return raw.split("```")[1].split("```")[0].strip()
        return raw

    @staticmethod
    def _normalize_result(result: dict, *, default_confidence: float) -> dict:
        """Fill missing keys with defaults and coerce/clamp values in place.

        Fix over the previous inline version: a severity_score like "85.0" or a
        non-string risk_level no longer raises (which used to discard the whole
        analysis via the generic error fallback); confidence is now clamped to
        [0.0, 1.0] and key_findings coerced to a list.
        """
        result.setdefault("summary", "ุชุนุฐู‘ุฑ ุฅู†ุดุงุก ุงู„ู…ู„ุฎุต.")
        result.setdefault("risk_level", "medium")
        result.setdefault("severity_score", 50)
        result.setdefault("key_findings", [])
        result.setdefault("recommendation", "ูŠูุฑุฌู‰ ู…ุฑุงุฌุนุฉ ุงู„ูˆุซูŠู‚ุฉ ูŠุฏูˆูŠุงู‹.")
        result.setdefault("confidence", default_confidence)
        result.setdefault("tampering_detected", False)
        # If tampering is detected, prepend a warning to the recommendation.
        if result.get("tampering_detected"):
            result["recommendation"] = (
                "๐Ÿšจ ุชุญุฐูŠุฑ: ุงุดุชุจุงู‡ ููŠ ุชู„ุงุนุจ ุจุงู„ูˆุซูŠู‚ุฉ. " + str(result["recommendation"])
            )
        # Clamp severity_score to [0, 100]; tolerate numeric strings/floats.
        try:
            score = int(float(result["severity_score"]))
        except (TypeError, ValueError):
            score = 50
        result["severity_score"] = max(0, min(100, score))
        # Normalize risk_level to one of the three allowed values.
        level = str(result["risk_level"]).lower()
        result["risk_level"] = level if level in ("high", "medium", "low") else "medium"
        # Clamp confidence to [0.0, 1.0].
        try:
            conf = float(result["confidence"])
        except (TypeError, ValueError):
            conf = 0.0
        result["confidence"] = max(0.0, min(1.0, conf))
        if not isinstance(result["key_findings"], list):
            result["key_findings"] = [str(result["key_findings"])]
        return result

    # โ”€โ”€โ”€ Fallbacks โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
    @staticmethod
    def _unavailable_fallback(doc_type: str) -> dict:
        """Fallback returned when no GROQ_API_KEY is configured."""
        return {
            "summary": "ุฎุฏู…ุฉ ุงู„ุชุญู„ูŠู„ ุบูŠุฑ ู…ุชุงุญุฉ ุญุงู„ูŠุงู‹.",
            "risk_level": "medium",
            "severity_score": 50,
            "key_findings": ["ู„ู… ูŠุชู… ุฅุฌุฑุงุก ุงู„ุชุญู„ูŠู„ โ€” GROQ_API_KEY ุบูŠุฑ ู…ุถุจูˆุท."],
            "recommendation": "ูŠูุฑุฌู‰ ุถุจุท GROQ_API_KEY ูˆุฅุนุงุฏุฉ ุงู„ู…ุญุงูˆู„ุฉ.",
            "confidence": 0.0,
            "tampering_detected": False,
        }

    @staticmethod
    def _parse_error_fallback(doc_type: str) -> dict:
        """Fallback returned when the LLM reply could not be parsed as JSON."""
        return {
            "summary": "ุชุนุฐู‘ุฑ ุชุญู„ูŠู„ ุงู„ูˆุซูŠู‚ุฉ โ€” ุงู„ุฑุฏ ุบูŠุฑ ุตุงู„ุญ.",
            "risk_level": "medium",
            "severity_score": 50,
            "key_findings": ["ูุดู„ ุชุญู„ูŠู„ ุฑุฏ ุงู„ู†ู…ูˆุฐุฌ โ€” ูŠูุฑุฌู‰ ุงู„ู…ุฑุงุฌุนุฉ ุงู„ูŠุฏูˆูŠุฉ."],
            "recommendation": "ุฑุงุฌุน ุงู„ุจูŠุงู†ุงุช ุงู„ู…ุณุชุฎุฑุฌุฉ ูŠุฏูˆูŠุงู‹.",
            "confidence": 0.0,
            "tampering_detected": False,
        }

    @staticmethod
    def _error_fallback(doc_type: str) -> dict:
        """Fallback returned on any other failure (network, API, unexpected)."""
        return {
            "summary": "ุญุฏุซ ุฎุทุฃ ุฃุซู†ุงุก ุงู„ุชุญู„ูŠู„.",
            "risk_level": "medium",
            "severity_score": 50,
            "key_findings": ["ุชุนุฐู‘ุฑ ุงู„ุชุญู„ูŠู„ ุจุณุจุจ ุฎุทุฃ ุชู‚ู†ูŠ."],
            "recommendation": "ูŠูุฑุฌู‰ ุงู„ู…ุญุงูˆู„ุฉ ู…ุฌุฏุฏุงู‹ ุฃูˆ ุงู„ู…ุฑุงุฌุนุฉ ุงู„ูŠุฏูˆูŠุฉ.",
            "confidence": 0.0,
            "tampering_detected": False,
        }
# Module-level singleton: all importers share one analyzer (and one Groq
# client).  Instantiated at import time, so GROQ_API_KEY is read once.
document_analyzer = DocumentAnalyzer()