# NOTE(review): removed hosting-page scrape artifact ("Spaces: / Running / Running")
# that preceded the module — it is not valid Python and not part of the source.
| """ | |
| Document Analyzer โ Sends OCR-extracted JSON to Groq LLM for intelligent analysis. | |
| Produces: summary, risk_level, severity_score, key_findings, recommendation. | |
| Uses the existing GROQ_API_KEY โ zero additional cost. | |
| """ | |
| import json | |
| import logging | |
| from app.core.config import settings | |
| logger = logging.getLogger(__name__) | |
# --- Arabic prompt template, shared by all document types ------------------
# Filled via str.format with {doc_type} and {fields_json}; the literal JSON
# braces are escaped as {{ }}. The prompt instructs the model to reply with
# JSON only (summary, risk_level, severity_score, key_findings,
# recommendation, confidence, tampering_detected) and gives Arabic scoring
# rules for risk_level and tampering detection.
# NOTE(review): the Arabic text below appears mojibake-damaged by an earlier
# encoding round-trip — confirm against the original file before relying on
# the prompt wording; bytes are preserved as found.
_BASE_PROMPT = """\
ุฃูุช ู ุญูู ูุซุงุฆู ู ุชุฎุตุต ูู ู ูุตุฉ ุนูู ููู ุณุงุนุฏุฉ ุงูุงุฌุชู ุงุนูุฉ.
ุชู ุงุณุชุฎุฑุงุฌ ุงูุจูุงูุงุช ุงูุชุงููุฉ ู ู ูุซููุฉ ู ู ููุน "{doc_type}" ุจุงุณุชุฎุฏุงู OCR:
{fields_json}
ูู ุจุชุญููู ูุฐู ุงูุจูุงูุงุช ูุฃุนุฏ ุฑุฏูุง ุจุชูุณูู JSON ููุท โ ุจุฏูู ุฃู ุดุฑุญ ุฃู ูุต ุฎุงุฑุฌ JSON.
ุงูุญููู ุงูู ุทููุจุฉ:
{{
"summary": "ู ูุฎุต ู ูุฌุฒ ูููุซููุฉ ุจุงูุนุฑุจูุฉ (ุฌู ูุฉ ุฃู ุฌู ูุชุงู)",
"risk_level": "high | medium | low",
"severity_score": <ุฑูู ู ู 0 ุฅูู 100>,
"key_findings": ["ุงูุชุดุงู 1", "ุงูุชุดุงู 2", ...],
"recommendation": "ุชูุตูุฉ ูุงุญุฏุฉ ูุงุถุญุฉ ููู ูุธู ุงูู ุณุคูู",
"confidence": <ุฑูู ู ู 0.0 ุฅูู 1.0 ูุนุจุฑ ุนู ู ุฏู ุงูุชู ุงู ุงูุจูุงูุงุช>,
"tampering_detected": true | false
}}
ููุงุนุฏ ุงูุชูููู :
- risk_level=high โ ู ุฑุถ ู ุฒู ู / ุฏุฎู ุฃูู ู ู 1500 ุฌููู / ุฏููู ุชุชุฌุงูุฒ ุถุนู ุงูุฏุฎู
- risk_level=medium โ ูุถุน ู ุชูุณุท ูุญุชุงุฌ ู ุชุงุจุนุฉ
- risk_level=low โ ูุถุน ู ุณุชูุฑ ูุง ูุณุชุฏุนู ุชุฏุฎูุงู ุนุงุฌูุงู
- ูู ูู ุญููู ูุงูุตุฉ (null) โ ุงุฐูุฑูุง ูู key_findings ูุฎูุถ confidence
- ูููุดู ุนู ุงูุชุฒููุฑ (tampering_detected=true): ูู ููุงู ุชูุงูุถ ุตุฑูุญ ูู ุงูุชูุงุฑูุฎุ ูู ุงูุฃุฑูุงู ุฃู ุงููุตูุต ุชุจุฏู ุบูุฑ ู ูุทููุฉ ุฃู ู ุชูุทุนุฉ ุจุดูู ูุฏู ุนูู ุชูุงุนุจุ
- ุฃุฌุจ ุจู JSON ููุท ุจุฏูู markdown
"""
| # โโโ Analyzer โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
class DocumentAnalyzer:
    """LLM-powered document intelligence layer, built on top of OCR output.

    Sends OCR-extracted fields to the Groq chat-completions API using the
    module-level ``_BASE_PROMPT`` Arabic template, then validates and
    normalizes the model's JSON reply into the DocumentAnalysis shape:
    summary / risk_level / severity_score / key_findings / recommendation /
    confidence / tampering_detected. When the API key is missing or any step
    fails, a canned fallback dict with the same keys is returned instead of
    raising.
    """

    def __init__(self):
        # Degrade gracefully: without an API key the analyzer stays importable
        # and analyze() returns the "unavailable" fallback.
        if not settings.GROQ_API_KEY:
            logger.warning("GROQ_API_KEY missing โ Document Analyzer unavailable.")
            self.client = None
        else:
            # Lazy import so the groq package is only required when configured.
            from groq import AsyncGroq
            self.client = AsyncGroq(api_key=settings.GROQ_API_KEY, timeout=30.0)
            logger.info("Document Analyzer initialized (Groq).")

    def is_available(self) -> bool:
        """Return True when a Groq client was constructed in __init__."""
        return self.client is not None

    async def analyze(self, ocr_fields: dict, document_type: str) -> dict:
        """
        Analyze OCR-extracted fields using LLM.
        Args:
            ocr_fields: The dict returned by OCR (without _provider key).
            document_type: e.g. "medical_report", "income_proof".
        Returns:
            dict matching DocumentAnalysis schema, or a graceful fallback.
        """
        if not self.client:
            return self._unavailable_fallback(document_type)
        # Remove internal keys (leading underscore) before sending to the LLM.
        clean_fields = {k: v for k, v in ocr_fields.items() if not k.startswith("_")}
        # Null-field ratio is a quick completeness hint, used as the default
        # "confidence" when the model omits one.
        null_count = sum(1 for v in clean_fields.values() if v is None)
        total_fields = len(clean_fields) or 1  # guard against empty dict / div-by-zero
        prompt = _BASE_PROMPT.format(
            doc_type=document_type,
            fields_json=json.dumps(clean_fields, ensure_ascii=False, indent=2),
        )
        # Pre-bind so the JSONDecodeError handler's log line can never hit an
        # unbound local, whatever raised inside the try block.
        raw = ""
        try:
            response = await self.client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,  # low temperature for deterministic structured output
                max_tokens=512,
            )
            raw = response.choices[0].message.content.strip()
            # Strip markdown code fences if the model wrapped its JSON anyway.
            if "```json" in raw:
                raw = raw.split("```json")[1].split("```")[0].strip()
            elif "```" in raw:
                raw = raw.split("```")[1].split("```")[0].strip()
            result = json.loads(raw)
            # Fill any missing required keys with safe defaults.
            result.setdefault("summary", "ุชุนุฐูุฑ ุฅูุดุงุก ุงูู ูุฎุต.")
            result.setdefault("risk_level", "medium")
            result.setdefault("severity_score", 50)
            result.setdefault("key_findings", [])
            result.setdefault("recommendation", "ููุฑุฌู ู ุฑุงุฌุนุฉ ุงููุซููุฉ ูุฏููุงู.")
            result.setdefault("confidence", round(1 - (null_count / total_fields), 2))
            result.setdefault("tampering_detected", False)
            # Tampering suspicion overrides the recommendation with a warning prefix.
            if result.get("tampering_detected"):
                result["recommendation"] = "๐จ ุชุญุฐูุฑ: ุงุดุชุจุงู ูู ุชูุงุนุจ ุจุงููุซููุฉ. " + result["recommendation"]
            # Clamp severity_score into [0, 100] and normalize risk_level.
            result["severity_score"] = max(0, min(100, int(result["severity_score"])))
            result["risk_level"] = result["risk_level"].lower()
            if result["risk_level"] not in ("high", "medium", "low"):
                result["risk_level"] = "medium"
            return result
        except json.JSONDecodeError as e:
            logger.error("LLM returned non-JSON response: %s | raw=%s", e, raw[:200])
            return self._parse_error_fallback(document_type)
        except Exception as e:
            logger.error("Document analysis failed: %s", e, exc_info=True)
            return self._error_fallback(document_type)

    # --- Fallbacks ---------------------------------------------------------
    # BUG FIX: these three helpers were declared without ``self`` yet invoked
    # as bound methods (``self._unavailable_fallback(document_type)``), so the
    # implicit instance argument made every call raise TypeError. Marking them
    # @staticmethod fixes the binding while keeping call sites and signatures
    # unchanged. ``doc_type`` is currently unused but kept for the interface.

    @staticmethod
    def _unavailable_fallback(doc_type: str) -> dict:
        """Canned result when no GROQ_API_KEY is configured."""
        return {
            "summary": "ุฎุฏู ุฉ ุงูุชุญููู ุบูุฑ ู ุชุงุญุฉ ุญุงููุงู.",
            "risk_level": "medium",
            "severity_score": 50,
            "key_findings": ["ูู ูุชู ุฅุฌุฑุงุก ุงูุชุญููู โ GROQ_API_KEY ุบูุฑ ู ุถุจูุท."],
            "recommendation": "ููุฑุฌู ุถุจุท GROQ_API_KEY ูุฅุนุงุฏุฉ ุงูู ุญุงููุฉ.",
            "confidence": 0.0,
            "tampering_detected": False,
        }

    @staticmethod
    def _parse_error_fallback(doc_type: str) -> dict:
        """Canned result when the LLM reply is not valid JSON."""
        return {
            "summary": "ุชุนุฐูุฑ ุชุญููู ุงููุซููุฉ โ ุงูุฑุฏ ุบูุฑ ุตุงูุญ.",
            "risk_level": "medium",
            "severity_score": 50,
            "key_findings": ["ูุดู ุชุญููู ุฑุฏ ุงููู ูุฐุฌ โ ููุฑุฌู ุงูู ุฑุงุฌุนุฉ ุงููุฏููุฉ."],
            "recommendation": "ุฑุงุฌุน ุงูุจูุงูุงุช ุงูู ุณุชุฎุฑุฌุฉ ูุฏููุงู.",
            "confidence": 0.0,
            "tampering_detected": False,
        }

    @staticmethod
    def _error_fallback(doc_type: str) -> dict:
        """Canned result for any other failure during analysis."""
        return {
            "summary": "ุญุฏุซ ุฎุทุฃ ุฃุซูุงุก ุงูุชุญููู.",
            "risk_level": "medium",
            "severity_score": 50,
            "key_findings": ["ุชุนุฐูุฑ ุงูุชุญููู ุจุณุจุจ ุฎุทุฃ ุชููู."],
            "recommendation": "ููุฑุฌู ุงูู ุญุงููุฉ ู ุฌุฏุฏุงู ุฃู ุงูู ุฑุงุฌุนุฉ ุงููุฏููุฉ.",
            "confidence": 0.0,
            "tampering_detected": False,
        }
# Module-level singleton — import ``document_analyzer`` (not the class) so the
# Groq client is constructed once per process, at import time.
document_analyzer = DocumentAnalyzer()