# Source: Hugging Face Space "Muthukumarank/forensic-triage-intelligence"
# Space status when captured: Sleeping
# File size: 12,343 bytes
# Revision: c0e4f03

"""
Module 1: AI-Based Autopsy Report Analysis
==========================================
Extracts forensic entities from unstructured autopsy reports using
pattern-based NLP with forensic-specific rules.
"""

import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple


@dataclass
class ForensicEntity:
    """One labelled span extracted from an autopsy report.

    Instances are plain value objects: the dataclass decorator supplies the
    constructor, ``repr`` and field-wise equality.
    """

    # The matched entity text.
    text: str
    # Category name, e.g. "INJURY" or "CAUSE_OF_DEATH".
    label: str
    # Character offset in the analysed text where the span begins.
    start: int
    # Character offset in the analysed text where the span ends (exclusive).
    end: int
    # Heuristic score assigned by the extractor.
    confidence: float
    # Snippet of surrounding report text; empty when not supplied.
    context: str = ""


class AutopsyAnalyzer:
    """Rule-based forensic entity extractor for autopsy reports.

    Entities are found with the regular expressions stored in
    ``self.patterns`` (label -> list of pattern strings).  ``analyze`` is the
    main entry point; it returns the raw entities together with several
    presentation-ready views (highlight segments, table rows, summary).
    """

    def __init__(self):
        self._compile_patterns()

    def _compile_patterns(self):
        """Populate ``self.patterns`` with the extraction rules.

        The patterns are kept as raw strings, grouped by entity label; they
        are handed to ``re.finditer`` at extraction time.  Every pattern
        carries an inline ``(?i)`` flag and one capturing group that marks
        the entity text.  The insertion order of the labels matters: see
        ``_extract_entities`` for how overlapping matches are resolved.
        """
        self.patterns = {
            "CAUSE_OF_DEATH": [
                r"(?i)cause\s+of\s+death[:\s]*([^\n.]{5,120})",
                r"(?i)died\s+(?:of|from|due\s+to)\s+([^\n.]+)",
                r"(?i)fatal\s+([^\n.,]+(?:injury|trauma|hemorrhage|asphyxia|poisoning|failure))",
                r"(?i)(?:primary|immediate|underlying)\s+cause[:\s]*([^\n.]+)",
                r"(?i)(asphyxia\s+due\s+to\s+[^\n.,]+)",
                r"(?i)(blunt force (?:head )?trauma[^\n.,]{0,50})",
            ],
            "MANNER_OF_DEATH": [
                r"(?i)manner\s+of\s+death[:\s]*(homicide|suicide|accident(?:al)?|natural|undetermined|pending)",
            ],
            "INJURY": [
                r"(?i)(blunt\s+force\s+trauma[^\n.,]{0,80})",
                r"(?i)(sharp\s+force\s+(?:injury|trauma|wound)[^\n.,]{0,60})",
                r"(?i)(gunshot\s+wound[^\n.,]{0,60})",
                r"(?i)(stab\s+wound[^\n.,]{0,60})",
                r"(?i)(laceration[s]?[^\n.,]{0,60})",
                r"(?i)(contusion[s]?\s+(?:on|of|to)\s+[^\n.,]{0,60})",
                r"(?i)(abrasion[s]?[^\n.,]{0,60})",
                r"(?i)(fracture[s]?[^\n.,]{0,60})",
                r"(?i)(subdural\s+hematoma[^\n.,]{0,60})",
                r"(?i)(defensive\s+wounds?[^\n.,]{0,80})",
                r"(?i)(petechial\s+hemorrhages?[^\n.,]{0,60})",
                r"(?i)(ligature\s+mark[^\n.,]{0,80})",
            ],
            "TOXICOLOGY": [
                r"(?i)(blood\s+alcohol[:\s]*\d+\.\d+\s*g/dL[^\n.,]*)",
                r"(?i)(benzodiazepines?[:\s]*[^\n.,]{0,60})",
                r"(?i)((?:cocaine|heroin|methamphetamine|fentanyl|morphine|diazepam)[^\n.,]{0,60})",
                r"(?i)(no\s+illicit\s+substances?\s+detected)",
                r"(?i)(trace\s+levels?\s*[-–]?\s*[^\n.,]{0,40})",
            ],
            "TIME_INDICATOR": [
                r"(?i)((?:approximately|about|estimated)\s+\d+[-–]\d+\s+hours?\s+(?:prior|before|after)[^\n.,]*)",
                r"(?i)((?:March|April|May|June|July|August|September|October|November|December|January|February)\s+\d{1,2},?\s*\d{4}[,\s]*\d{1,2}:\d{2}\s*(?:AM|PM)?)",
                r"(?i)(rigor\s+mortis\s+is\s+fully\s+developed[^\n.,]{0,40})",
                r"(?i)(lividity\s+is\s+fixed[^\n.,]{0,40})",
                r"(?i)(time\s+of\s+death\s+estimated[^\n.,]{0,80})",
            ],
            "ANATOMICAL": [
                r"(?i)((?:right|left)\s+temporal\s+region[^\n.,]{0,30})",
                r"(?i)((?:right|left)\s+(?:forearm|shoulder|arm|leg)[^\n.,]{0,30})",
                r"(?i)(conjunctivae\s+bilaterally)",
                r"(?i)((?:right|left)\s+hemisphere[^\n.,]{0,30})",
                r"(?i)(hyoid\s+bone)",
            ],
            "MEDICAL_FINDING": [
                r"(?i)(cerebral\s+edema[^\n.,]*)",
                r"(?i)(lungs?\s+show\s+[^\n.,]+)",
                r"(?i)((?:heart|liver|brain)[:\s]*\d+\s*g[^\n.,]{0,30})",
                r"(?i)(stomach\s+contents[:\s]*[^\n.,]+)",
                r"(?i)(mild\s+fatty\s+changes)",
                r"(?i)(hyoid\s+bone\s+intact)",
            ],
            "DEMOGRAPHIC": [
                r"(?i)((?:well|poorly)[-\s]nourished\s+adult\s+(?:male|female)[^\n.,]{0,40})",
                r"(?i)(approximately\s+\d+[-–]\d+\s+years)",
                r"(?i)(weighing\s+approximately\s+\d+\s*kg)",
                r"(?i)(approximately\s+\d['\u2019]\d+[\"″\u201d]?\s*tall)",
            ],
            "LOCATION": [
                r"(?i)(abandoned\s+warehouse[^\n.,]{0,40})",
                r"(?i)(industrial\s+district[^\n.,]{0,30})",
                r"(?i)(block\s+\d+)",
            ],
            "EVIDENCE": [
                r"(?i)(skin\s+under\s+fingernails\s+collected\s+for\s+DNA\s+analysis)",
                r"(?i)(foreign\s+fibers?\s+recovered[^\n.,]{0,60})",
                r"(?i)(fingerprints?\s+(?:collected|recovered|found|lifted)[^\n.,]{0,40})",
                r"(?i)(synthetic,?\s*(?:blue|red|black|white)[^\n.,]{0,40})",
            ],
        }

    def extract_text_from_file(self, filepath: Optional[str]) -> str:
        """Return the text content of an uploaded ``.pdf`` or ``.txt`` file.

        Args:
            filepath: Path of the file, or ``None`` when nothing was uploaded.

        Returns:
            The extracted text; ``""`` when ``filepath`` is ``None``; the
            literal ``"[Unsupported file format]"`` for any other extension.
            The extension is matched case-insensitively, so ``REPORT.TXT``
            and ``scan.PDF`` are accepted.
        """
        if filepath is None:
            return ""

        lowered = filepath.lower()
        if lowered.endswith(".pdf"):
            # Only the import is guarded: a missing optional dependency is the
            # one condition the crude fallback below is meant to cover.
            try:
                import pdfplumber
            except ImportError:
                with open(filepath, "rb") as f:
                    content = f.read()
                # Best effort without a PDF parser: decode the raw bytes and
                # drop everything that is not printable ASCII, newline, CR or
                # tab.
                text = content.decode("utf-8", errors="ignore")
                return re.sub(r'[^\x20-\x7E\n\r\t]', '', text)
            with pdfplumber.open(filepath) as pdf:
                return "\n".join(page.extract_text() or "" for page in pdf.pages)

        if lowered.endswith(".txt"):
            with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()

        return "[Unsupported file format]"

    def analyze(self, text: Optional[str]) -> Dict[str, Any]:
        """Run the full extraction pipeline on autopsy report text.

        Args:
            text: Report text.  ``None`` is treated like an empty report so
                that a missing upload does not raise.

        Returns:
            A dict with the keys ``entities``, ``highlighted_text``,
            ``entities_table``, ``summary``, ``summary_markdown`` and
            ``time_events``.
        """
        if text is None:
            text = ""

        entities = self._extract_entities(text)
        summary = self._generate_summary(entities, text)

        return {
            "entities": entities,
            "highlighted_text": self._build_highlighted_text(text, entities),
            "entities_table": self._build_entities_table(entities),
            "summary": summary,
            "summary_markdown": self._format_summary_markdown(summary),
            "time_events": self._extract_time_events(entities),
        }

    @staticmethod
    def _match_span(match) -> Tuple[str, int, int]:
        """Return ``(text, start, end)`` for a regex match, whitespace-trimmed.

        The first capturing group is used when the pattern has one that took
        part in the match, otherwise the whole match.  ``start``/``end`` are
        moved inwards by the amount of stripped whitespace so that slicing
        the source text with them yields exactly ``text``.
        """
        group = 1 if match.lastindex and match.group(1) is not None else 0
        raw = match.group(group)
        cleaned = raw.strip()
        start = match.start(group) + (len(raw) - len(raw.lstrip()))
        return cleaned, start, start + len(cleaned)

    def _extract_entities(self, text: str) -> List[ForensicEntity]:
        """Extract all forensic entities from ``text``.

        Patterns are tried label by label in the order of ``self.patterns``.
        A candidate is dropped when it is shorter than three characters or
        when it overlaps a span that was already accepted.

        NOTE(review): because the first accepted span wins, label order acts
        as a priority.  For example the ANATOMICAL pattern ``hyoid bone`` is
        tried before the MEDICAL_FINDING pattern ``hyoid bone intact`` and
        always claims the span first, so the latter can never be emitted.

        Returns:
            The accepted entities sorted by start offset.
        """
        entities = []
        accepted_spans = []

        for label, patterns in self.patterns.items():
            for pattern in patterns:
                for match in re.finditer(pattern, text):
                    entity_text, start, end = self._match_span(match)
                    if len(entity_text) < 3:
                        continue

                    # Half-open interval intersection test against every
                    # span accepted so far.
                    if any(start < taken_end and end > taken_start
                           for taken_start, taken_end in accepted_spans):
                        continue
                    accepted_spans.append((start, end))

                    # Heuristic score: longer matches score slightly higher,
                    # capped at 0.95.
                    confidence = min(0.95, 0.75 + len(entity_text) * 0.002)

                    # Up to 30 characters of report text on either side.
                    ctx_start = max(0, start - 30)
                    ctx_end = min(len(text), end + 30)

                    entities.append(ForensicEntity(
                        text=entity_text,
                        label=label,
                        start=start,
                        end=end,
                        confidence=confidence,
                        context=text[ctx_start:ctx_end].strip(),
                    ))

        entities.sort(key=lambda e: e.start)
        return entities

    def _build_highlighted_text(
        self, text: str, entities: List[ForensicEntity]
    ) -> List[Tuple[str, Optional[str]]]:
        """Split ``text`` into ``(segment, label)`` pairs for highlighting.

        Segments covered by an entity carry that entity's label; the text
        between entities carries ``None``.  ``entities`` is expected to be
        sorted by ``start`` and non-overlapping, as produced by
        ``_extract_entities``.
        """
        if not entities:
            return [(text, None)]

        segments = []
        cursor = 0
        for entity in entities:
            if entity.start > cursor:
                segments.append((text[cursor:entity.start], None))
            segments.append((text[entity.start:entity.end], entity.label))
            cursor = entity.end

        if cursor < len(text):
            segments.append((text[cursor:], None))
        return segments

    def _build_entities_table(self, entities: List[ForensicEntity]) -> List[Dict]:
        """Return one display row per entity.

        Entity text is truncated to 100 characters, context to 80, and the
        confidence is rounded to three decimals.
        """
        rows = []
        for entity in entities:
            rows.append({
                "Entity": entity.text[:100],
                "Category": entity.label,
                "Confidence": round(entity.confidence, 3),
                "Context": entity.context[:80],
            })
        return rows

    def _generate_summary(self, entities: List[ForensicEntity], text: str) -> Dict:
        """Aggregate entities into per-category counts and finding lists.

        ``text`` is accepted for interface compatibility but is not used.
        When several MANNER_OF_DEATH entities exist, the last one wins.
        """
        summary = {
            "total_entities": len(entities),
            "categories": {},
            "cause_of_death": [],
            "manner_of_death": None,
            "injuries": [],
            "toxicology_findings": [],
            "key_time_indicators": [],
            "evidence_collected": [],
        }
        # Labels whose entity texts are collected into a list field.
        list_field_for_label = {
            "CAUSE_OF_DEATH": "cause_of_death",
            "INJURY": "injuries",
            "TOXICOLOGY": "toxicology_findings",
            "TIME_INDICATOR": "key_time_indicators",
            "EVIDENCE": "evidence_collected",
        }

        categories = summary["categories"]
        for entity in entities:
            label = entity.label
            categories[label] = categories.get(label, 0) + 1
            if label == "MANNER_OF_DEATH":
                summary["manner_of_death"] = entity.text
            elif label in list_field_for_label:
                summary[list_field_for_label[label]].append(entity.text)

        return summary

    def _format_summary_markdown(self, summary: Dict) -> str:
        """Render a summary dict (see ``_generate_summary``) as Markdown.

        Categories are listed by descending count.  At most ten injuries are
        listed, although the heading shows the total number.
        """
        parts = [
            "## 📋 Autopsy Analysis Summary\n\n",
            f"**Total Entities Extracted:** {summary['total_entities']}\n\n",
            "### Entity Categories\n",
            "| Category | Count |\n|----------|-------|\n",
        ]
        for cat, count in sorted(summary["categories"].items(), key=lambda x: -x[1]):
            parts.append(f"| {cat} | {count} |\n")
        parts.append("\n")

        parts.append("### 🔑 Key Findings\n\n")
        if summary["manner_of_death"]:
            parts.append(f"**Manner of Death:** `{summary['manner_of_death']}`\n\n")

        def bullet_section(heading, items, trailing_blank=True):
            # Heading line followed by one "- item" bullet per entry.
            parts.append(heading)
            parts.extend(f"- {item}\n" for item in items)
            if trailing_blank:
                parts.append("\n")

        if summary["cause_of_death"]:
            bullet_section("**Cause(s) of Death:**\n", summary["cause_of_death"])
        if summary["injuries"]:
            bullet_section(
                f"**Injuries ({len(summary['injuries'])}):**\n",
                summary["injuries"][:10],
            )
        if summary["toxicology_findings"]:
            bullet_section("**Toxicology:**\n", summary["toxicology_findings"])
        if summary["evidence_collected"]:
            bullet_section("**Evidence Collected:**\n", summary["evidence_collected"])
        if summary["key_time_indicators"]:
            # Last section: no blank line after it.
            bullet_section(
                "**Time Indicators:**\n",
                summary["key_time_indicators"],
                trailing_blank=False,
            )

        return "".join(parts)

    def _extract_time_events(self, entities: List[ForensicEntity]) -> List[Dict]:
        """Return one timeline record per TIME_INDICATOR entity.

        The entity text is used verbatim for both ``event`` and
        ``timestamp``; no date parsing is performed here.
        """
        events = []
        for entity in entities:
            if entity.label != "TIME_INDICATOR":
                continue
            events.append({
                "event": entity.text,
                "category": "Physical Evidence",
                "source": "Autopsy Report",
                "timestamp": entity.text,
            })
        return events