| """ |
| Module 1: AI-Based Autopsy Report Analysis |
| ========================================== |
| Extracts forensic entities from unstructured autopsy reports using |
| pattern-based NLP with forensic-specific rules. |
| """ |
|
|
import os
import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
@dataclass
class ForensicEntity:
    """One span of report text tagged with a forensic category.

    Plain data record; the optional ``context`` defaults to an empty string.
    """

    # Text of the extracted entity.
    text: str
    # Category name the entity was filed under (e.g. "INJURY").
    label: str
    # Character offset in the analysed text where the entity begins.
    start: int
    # Character offset in the analysed text where the entity ends (exclusive).
    end: int
    # Heuristic score attached by the extractor.
    confidence: float
    # Snippet of text surrounding the entity.
    context: str = ""
|
|
|
|
class AutopsyAnalyzer:
    """Regex-driven forensic entity extractor for autopsy reports.

    The analyzer holds a table of category -> regex patterns
    (``self.patterns``) and turns free text into ``ForensicEntity`` records,
    a highlighted-segment list, a table of rows, a summary dict, a markdown
    rendering of that summary, and a list of timeline events.
    """

    # Matches whose cleaned text is shorter than this are discarded.
    _MIN_ENTITY_LENGTH = 3
    # Number of characters captured on each side of an entity as context.
    _CONTEXT_CHARS = 30
    # Entity label -> summary key that collects the texts of that label.
    _SUMMARY_LIST_KEYS = {
        "CAUSE_OF_DEATH": "cause_of_death",
        "INJURY": "injuries",
        "TOXICOLOGY": "toxicology_findings",
        "TIME_INDICATOR": "key_time_indicators",
        "EVIDENCE": "evidence_collected",
    }

    def __init__(self) -> None:
        """Create an analyzer with the built-in pattern table installed."""
        self._compile_patterns()

    def _compile_patterns(self) -> None:
        """Install the category -> pattern-list table on ``self.patterns``.

        The patterns are stored as plain regex strings (not pre-compiled
        objects); they are handed to ``re.finditer`` at extraction time.

        NOTE(review): extraction walks categories and patterns in the order
        listed here and drops any match that overlaps an already accepted
        span, so an earlier pattern shadows a later one on the same text.
        For example ANATOMICAL ``hyoid bone`` always claims the span before
        MEDICAL_FINDING ``hyoid bone intact`` is tried, and CAUSE_OF_DEATH
        ``blunt force ... trauma`` is tried before the INJURY variant.
        Confirm whether that precedence is intended.
        """
        self.patterns = {
            "CAUSE_OF_DEATH": [
                r"(?i)cause\s+of\s+death[:\s]*([^\n.]{5,120})",
                r"(?i)died\s+(?:of|from|due\s+to)\s+([^\n.]+)",
                r"(?i)fatal\s+([^\n.,]+(?:injury|trauma|hemorrhage|asphyxia|poisoning|failure))",
                r"(?i)(?:primary|immediate|underlying)\s+cause[:\s]*([^\n.]+)",
                r"(?i)(asphyxia\s+due\s+to\s+[^\n.,]+)",
                r"(?i)(blunt force (?:head )?trauma[^\n.,]{0,50})",
            ],
            "MANNER_OF_DEATH": [
                r"(?i)manner\s+of\s+death[:\s]*(homicide|suicide|accident(?:al)?|natural|undetermined|pending)",
            ],
            "INJURY": [
                r"(?i)(blunt\s+force\s+trauma[^\n.,]{0,80})",
                r"(?i)(sharp\s+force\s+(?:injury|trauma|wound)[^\n.,]{0,60})",
                r"(?i)(gunshot\s+wound[^\n.,]{0,60})",
                r"(?i)(stab\s+wound[^\n.,]{0,60})",
                r"(?i)(laceration[s]?[^\n.,]{0,60})",
                r"(?i)(contusion[s]?\s+(?:on|of|to)\s+[^\n.,]{0,60})",
                r"(?i)(abrasion[s]?[^\n.,]{0,60})",
                r"(?i)(fracture[s]?[^\n.,]{0,60})",
                r"(?i)(subdural\s+hematoma[^\n.,]{0,60})",
                r"(?i)(defensive\s+wounds?[^\n.,]{0,80})",
                r"(?i)(petechial\s+hemorrhages?[^\n.,]{0,60})",
                r"(?i)(ligature\s+mark[^\n.,]{0,80})",
            ],
            "TOXICOLOGY": [
                r"(?i)(blood\s+alcohol[:\s]*\d+\.\d+\s*g/dL[^\n.,]*)",
                r"(?i)(benzodiazepines?[:\s]*[^\n.,]{0,60})",
                r"(?i)((?:cocaine|heroin|methamphetamine|fentanyl|morphine|diazepam)[^\n.,]{0,60})",
                r"(?i)(no\s+illicit\s+substances?\s+detected)",
                r"(?i)(trace\s+levels?\s*[-–]?\s*[^\n.,]{0,40})",
            ],
            "TIME_INDICATOR": [
                r"(?i)((?:approximately|about|estimated)\s+\d+[-–]\d+\s+hours?\s+(?:prior|before|after)[^\n.,]*)",
                r"(?i)((?:March|April|May|June|July|August|September|October|November|December|January|February)\s+\d{1,2},?\s*\d{4}[,\s]*\d{1,2}:\d{2}\s*(?:AM|PM)?)",
                r"(?i)(rigor\s+mortis\s+is\s+fully\s+developed[^\n.,]{0,40})",
                r"(?i)(lividity\s+is\s+fixed[^\n.,]{0,40})",
                r"(?i)(time\s+of\s+death\s+estimated[^\n.,]{0,80})",
            ],
            "ANATOMICAL": [
                r"(?i)((?:right|left)\s+temporal\s+region[^\n.,]{0,30})",
                r"(?i)((?:right|left)\s+(?:forearm|shoulder|arm|leg)[^\n.,]{0,30})",
                r"(?i)(conjunctivae\s+bilaterally)",
                r"(?i)((?:right|left)\s+hemisphere[^\n.,]{0,30})",
                r"(?i)(hyoid\s+bone)",
            ],
            "MEDICAL_FINDING": [
                r"(?i)(cerebral\s+edema[^\n.,]*)",
                r"(?i)(lungs?\s+show\s+[^\n.,]+)",
                r"(?i)((?:heart|liver|brain)[:\s]*\d+\s*g[^\n.,]{0,30})",
                r"(?i)(stomach\s+contents[:\s]*[^\n.,]+)",
                r"(?i)(mild\s+fatty\s+changes)",
                r"(?i)(hyoid\s+bone\s+intact)",
            ],
            "DEMOGRAPHIC": [
                r"(?i)((?:well|poorly)[-\s]nourished\s+adult\s+(?:male|female)[^\n.,]{0,40})",
                r"(?i)(approximately\s+\d+[-–]\d+\s+years)",
                r"(?i)(weighing\s+approximately\s+\d+\s*kg)",
                r"(?i)(approximately\s+\d['\u2019]\d+[\"″\u201d]?\s*tall)",
            ],
            "LOCATION": [
                r"(?i)(abandoned\s+warehouse[^\n.,]{0,40})",
                r"(?i)(industrial\s+district[^\n.,]{0,30})",
                r"(?i)(block\s+\d+)",
            ],
            "EVIDENCE": [
                r"(?i)(skin\s+under\s+fingernails\s+collected\s+for\s+DNA\s+analysis)",
                r"(?i)(foreign\s+fibers?\s+recovered[^\n.,]{0,60})",
                r"(?i)(fingerprints?\s+(?:collected|recovered|found|lifted)[^\n.,]{0,40})",
                r"(?i)(synthetic,?\s*(?:blue|red|black|white)[^\n.,]{0,40})",
            ],
        }

    def extract_text_from_file(self, filepath: Optional[str]) -> str:
        """Read the text content of an uploaded ``.pdf`` or ``.txt`` file.

        Args:
            filepath: Path of the file (``str`` or ``os.PathLike``), or
                ``None`` when nothing was uploaded.

        Returns:
            The extracted text; ``""`` when ``filepath`` is ``None``; the
            literal ``"[Unsupported file format]"`` for any other extension.
            The extension check is case-insensitive.

        Raises:
            OSError: If the file cannot be opened.
        """
        if filepath is None:
            return ""

        path = os.fspath(filepath)
        lowered = path.lower()

        if lowered.endswith(".pdf"):
            # Only the import is guarded: a failure while parsing a PDF
            # should surface instead of silently switching to the fallback.
            try:
                import pdfplumber
            except ImportError:
                # Best-effort fallback when pdfplumber is not installed:
                # decode the raw bytes and keep printable ASCII/whitespace.
                with open(path, "rb") as f:
                    raw = f.read()
                decoded = raw.decode("utf-8", errors="ignore")
                return re.sub(r'[^\x20-\x7E\n\r\t]', '', decoded)
            with pdfplumber.open(path) as pdf:
                return "\n".join(page.extract_text() or "" for page in pdf.pages)

        if lowered.endswith(".txt"):
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()

        return "[Unsupported file format]"

    def analyze(self, text: str) -> Dict[str, Any]:
        """Run the full extraction pipeline over report text.

        Args:
            text: Report text. ``None`` or ``""`` is treated as empty text
                and yields an empty analysis instead of raising.

        Returns:
            Dict with keys ``entities``, ``highlighted_text``,
            ``entities_table``, ``summary``, ``summary_markdown`` and
            ``time_events``.
        """
        text = text or ""
        entities = self._extract_entities(text)
        summary = self._generate_summary(entities, text)
        return {
            "entities": entities,
            "highlighted_text": self._build_highlighted_text(text, entities),
            "entities_table": self._build_entities_table(entities),
            "summary": summary,
            "summary_markdown": self._format_summary_markdown(summary),
            "time_events": self._extract_time_events(entities),
        }

    @staticmethod
    def _stripped_span(match: "re.Match") -> Tuple[str, int, int]:
        """Return ``(text, start, end)`` for a match, whitespace-trimmed.

        Uses capture group 1 when it participated in the match, otherwise
        the whole match. The offsets are moved inward together with the
        text so that ``source[start:end] == text`` always holds.
        """
        group = 1 if match.lastindex and match.group(1) is not None else 0
        raw = match.group(group)
        cleaned = raw.strip()
        # Shift the start past any leading whitespace that strip() removed.
        start = match.start(group) + (len(raw) - len(raw.lstrip()))
        return cleaned, start, start + len(cleaned)

    def _extract_entities(self, text: str) -> List["ForensicEntity"]:
        """Extract all non-overlapping forensic entities from ``text``.

        Categories and their patterns are tried in the order they appear in
        ``self.patterns``; a match overlapping an already accepted span is
        dropped, as is any match shorter than three characters after
        trimming.

        Returns:
            Entities sorted by start offset. Each entity's ``start``/``end``
            delimit exactly its ``text`` inside the input.
        """
        entities: List["ForensicEntity"] = []
        claimed: List[Tuple[int, int]] = []
        text_length = len(text)

        for label, patterns in self.patterns.items():
            for pattern in patterns:
                for match in re.finditer(pattern, text):
                    entity_text, start, end = self._stripped_span(match)
                    if len(entity_text) < self._MIN_ENTITY_LENGTH:
                        continue

                    # First accepted span wins; later overlaps are ignored.
                    if any(start < taken_end and end > taken_start
                           for taken_start, taken_end in claimed):
                        continue
                    claimed.append((start, end))

                    # Length-based heuristic score, capped at 0.95.
                    confidence = min(0.95, 0.75 + len(entity_text) * 0.002)

                    ctx_start = max(0, start - self._CONTEXT_CHARS)
                    ctx_end = min(text_length, end + self._CONTEXT_CHARS)

                    entities.append(ForensicEntity(
                        text=entity_text, label=label,
                        start=start, end=end,
                        confidence=confidence,
                        context=text[ctx_start:ctx_end].strip(),
                    ))

        entities.sort(key=lambda e: e.start)
        return entities

    def _build_highlighted_text(
        self, text: str, entities: List["ForensicEntity"]
    ) -> List[Tuple[str, Optional[str]]]:
        """Split ``text`` into ``(segment, label)`` pairs for Gradio.

        Segments covered by an entity carry the entity's label; the text
        between entities carries ``None``. ``entities`` is expected sorted
        by start offset; an entity starting inside an already emitted
        segment is skipped so no text is emitted twice.
        """
        if not entities:
            return [(text, None)]

        segments: List[Tuple[str, Optional[str]]] = []
        cursor = 0
        for entity in entities:
            if entity.start < cursor:
                continue
            if entity.start > cursor:
                segments.append((text[cursor:entity.start], None))
            segments.append((text[entity.start:entity.end], entity.label))
            cursor = entity.end

        if cursor < len(text):
            segments.append((text[cursor:], None))
        return segments

    def _build_entities_table(self, entities: List["ForensicEntity"]) -> List[Dict]:
        """Return one row dict per entity.

        ``Entity`` is cut to 100 characters, ``Context`` to 80, and
        ``Confidence`` is rounded to three decimals.
        """
        return [
            {
                "Entity": e.text[:100],
                "Category": e.label,
                "Confidence": round(e.confidence, 3),
                "Context": e.context[:80],
            }
            for e in entities
        ]

    def _generate_summary(self, entities: List["ForensicEntity"], text: str) -> Dict:
        """Aggregate entities into a summary dict.

        Args:
            entities: Extracted entities.
            text: Source text; not used by the current implementation and
                kept so the signature stays unchanged.

        Returns:
            Dict with the total count, per-category counts, and the entity
            texts grouped by finding type. When several MANNER_OF_DEATH
            entities exist, the last one in ``entities`` is kept.
        """
        summary: Dict[str, Any] = {
            "total_entities": len(entities),
            "categories": {},
            "cause_of_death": [],
            "manner_of_death": None,
            "injuries": [],
            "toxicology_findings": [],
            "key_time_indicators": [],
            "evidence_collected": [],
        }
        counts = summary["categories"]

        for entity in entities:
            label = entity.label
            counts[label] = counts.get(label, 0) + 1
            if label == "MANNER_OF_DEATH":
                summary["manner_of_death"] = entity.text
                continue
            list_key = self._SUMMARY_LIST_KEYS.get(label)
            if list_key is not None:
                summary[list_key].append(entity.text)

        return summary

    def _format_summary_markdown(self, summary: Dict) -> str:
        """Render a summary dict (as built by ``_generate_summary``) as markdown.

        Emits the total, a category/count table ordered by descending
        count, then one bullet list per non-empty finding group. At most
        the first ten injuries are listed; the heading shows the full count.
        """
        parts = [
            "## 📋 Autopsy Analysis Summary\n\n",
            f"**Total Entities Extracted:** {summary['total_entities']}\n\n",
            "### Entity Categories\n",
            "| Category | Count |\n|----------|-------|\n",
        ]
        ranked = sorted(summary["categories"].items(), key=lambda item: -item[1])
        parts.extend(f"| {category} | {count} |\n" for category, count in ranked)
        parts.append("\n")

        parts.append("### 🔑 Key Findings\n\n")
        if summary["manner_of_death"]:
            parts.append(f"**Manner of Death:** `{summary['manner_of_death']}`\n\n")

        def add_bullets(heading: str, items: List[str], blank_after: bool = True) -> None:
            # Append a heading plus one bullet per item; skip empty groups.
            if not items:
                return
            parts.append(heading)
            parts.extend(f"- {item}\n" for item in items)
            if blank_after:
                parts.append("\n")

        injuries = summary["injuries"]
        add_bullets("**Cause(s) of Death:**\n", summary["cause_of_death"])
        add_bullets(f"**Injuries ({len(injuries)}):**\n", injuries[:10])
        add_bullets("**Toxicology:**\n", summary["toxicology_findings"])
        add_bullets("**Evidence Collected:**\n", summary["evidence_collected"])
        # The final section is not followed by a blank line.
        add_bullets("**Time Indicators:**\n", summary["key_time_indicators"],
                    blank_after=False)

        return "".join(parts)

    def _extract_time_events(self, entities: List["ForensicEntity"]) -> List[Dict]:
        """Return one timeline event dict per TIME_INDICATOR entity.

        The entity text is used for both ``event`` and ``timestamp``; no
        date parsing is performed here.
        """
        return [
            {
                "event": e.text,
                "category": "Physical Evidence",
                "source": "Autopsy Report",
                "timestamp": e.text,
            }
            for e in entities if e.label == "TIME_INDICATOR"
        ]
|
|