File size: 6,412 Bytes
968e24d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
"""
Enhanced Judgment Segmenter (FIXED)
Segments judgments into: Facts, Issues, Arguments, Analysis, Decision
"""

import re
import os
import logging
from typing import List, Dict, Tuple
from dataclasses import dataclass

try:
    from transformers import pipeline
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False

logger = logging.getLogger(__name__)


@dataclass
class Section:
    """A contiguous run of judgment paragraphs that share one section label.

    The per-field notes describe how ``JudgmentSegmenter.segment`` populates
    the record; other producers may fill the fields differently.
    """

    type: str                  # facts/issues/arguments/analysis/decision/unknown
    text: str                  # member paragraphs joined with a blank line ("\n\n")
    start_para_idx: int        # index of the first paragraph in the run (inclusive)
    end_para_idx: int          # index of the last paragraph in the run (inclusive)
    confidence: float          # highest per-paragraph confidence in the run, rounded to 2 dp


class JudgmentSegmenter:
    """Label judgment paragraphs and group them into typed sections.

    Each paragraph is classified as facts / issues / arguments / analysis /
    decision (or ``unknown``), either by a HuggingFace text-classification
    pipeline loaded from ``model_path`` or, when that is unavailable, by the
    regex markers in ``MARKERS``. Consecutive paragraphs are then merged into
    ``Section`` records by :meth:`segment`.
    """

    # Regex markers per section type. They are matched against the
    # lower-cased paragraph text, so every pattern is written in lower case.
    MARKERS = {
        'facts': [
            r'\bbrief\s+facts?\b',
            r'\bfactual\s+(matrix|background)\b',
            r'\bcircumstances\s+of\s+the\s+case\b',
            r'\bbackground\b',
        ],
        'issues': [
            r'\bissues?\s+(for|of)\s+(consideration|determination)\b',
            r'\bsubstantial\s+questions?\b',
            r'\bpoints?\s+for\s+consideration\b',
            r'\bquestions?\s+framed\b',
        ],
        'arguments': [
            r'\blearned\s+counsel\b',
            r'\bsubmissions?\b',
            r'\b(argued|submitted|contended)\b',
            r'\bon\s+behalf\s+of\b',
        ],
        'analysis': [
            r'\bwe\s+have\s+(considered|examined|analysed)\b',
            r'\bthe\s+court\s+(finds|observes|notes|holds)\b',
            r'\bin\s+our\s+(view|opinion)\b',
            r'\bit\s+is\s+clear\s+that\b',
        ],
        'decision': [
            r'\b(appeal|petition|writ)\s+is\s+(allowed|dismissed)\b',
            r'\baccordingly\b',
            r'\bwe\s+direct\b',
            # No trailing \b: a boundary right after ':' would demand a word
            # character there, so the usual "held: <text>" would never match.
            r'\bheld\s*:',
            r'\border\b',
        ]
    }

    def __init__(self, model_path: str = "models/segmentation_model"):
        """Set up the segmenter, preferring the ML model over the regex fallback.

        Args:
            model_path: Path handed to the ``transformers`` pipeline. The ML
                path is used only if ``transformers`` imported successfully,
                this path exists, and the pipeline loads without raising.
        """
        self.use_ml = False
        self.classifier = None

        if TRANSFORMERS_AVAILABLE and os.path.exists(model_path):
            try:
                logger.info("Loading ML Segmentation model from %s...", model_path)
                self.classifier = pipeline("text-classification", model=model_path, device=-1)
                self.use_ml = True
                logger.info("✓ ML Segmenter loaded successfully.")
            except Exception as e:
                # Deliberate best-effort: any load failure degrades to regex
                # mode instead of aborting construction.
                logger.warning("Failed to load ML model, falling back to Regex: %s", e)
        else:
            logger.info("ML model not found or transformers not installed. Using Regex fallback.")

    def detect_section(self, para: str, position_ratio: float) -> Tuple[str, float]:
        """Classify one paragraph with the regex markers.

        Args:
            para: Paragraph text.
            position_ratio: Relative position of the paragraph in the
                document (``index / paragraph_count``).

        Returns:
            ``(section_type, confidence)``. ``('unknown', 0.0)`` when no
            marker matches; otherwise the highest-scoring marker wins, and on
            a tie the marker listed first in ``MARKERS`` is kept.
        """
        para_lower = para.lower()
        head = para_lower[:120]
        best_type, best_conf = 'unknown', 0.0

        for sec_type, patterns in self.MARKERS.items():
            for pattern in patterns:
                if not re.search(pattern, para_lower):
                    continue

                conf = 0.6

                # Position-based bias: facts markers score higher early in
                # the document, decision markers score higher late.
                if sec_type == 'facts' and position_ratio < 0.30:
                    conf += 0.2
                elif sec_type == 'decision' and position_ratio > 0.70:
                    conf += 0.3

                # A marker inside the first 120 characters is treated as a
                # stronger anchor than one buried deeper in the paragraph.
                if re.search(pattern, head):
                    conf += 0.2

                conf = min(conf, 1.0)

                # Strict '>' keeps the earliest marker on equal confidence.
                if conf > best_conf:
                    best_type, best_conf = sec_type, conf

        return best_type, best_conf

    def detect_section_ml(self, para: str) -> Tuple[str, float]:
        """Classify one paragraph with the loaded text-classification pipeline.

        Returns:
            ``(label, score)`` where ``label`` is the model label lower-cased
            with any ``label_`` prefix removed. Returns ``('unknown', 0.0)``
            for blank input or when no classifier is loaded, and
            ``('unknown', score)`` when the score is below 0.5.
        """
        if self.classifier is None or not para.strip():
            return "unknown", 0.0

        # Only the first 512 characters are classified, to keep the input
        # short enough for the tokenizer.
        result = self.classifier(para[:512])[0]

        # Assumes labels look like LABEL_FACTS / LABEL_ISSUES or plain
        # facts / issues - verify against the trained model's label map.
        label = result['label'].lower().replace('label_', '')
        score = result['score']

        # Enforce confidence threshold
        if score < 0.5:
            return "unknown", score

        return label, score

    def _build_section(self, sec_type: str, paras: List[str], start: int,
                       end: int, conf: float) -> "Section":
        """Pack a finished run of paragraphs into a ``Section`` record."""
        return Section(
            type=sec_type,
            text="\n\n".join(paras),
            start_para_idx=start,
            end_para_idx=end,
            confidence=round(conf, 2),
        )

    def segment(self, paragraph_texts: List[str]) -> List["Section"]:
        """Group a list of paragraphs into consecutive typed sections.

        Section indices refer to positions in ``paragraph_texts``.

        A new section starts when a paragraph is detected as a different
        type with confidence above 0.4. Paragraphs that do not trigger a
        boundary are appended to the current section. Undetected paragraphs
        in the first 30% of the document (except the very first one) are
        assumed to be facts; that assumption opens a ``facts`` section only
        while no typed section has started, so it never interrupts a section
        that a marker already opened.

        Args:
            paragraph_texts: Paragraph strings in document order.

        Returns:
            Sections in document order, covering every input paragraph.
            An empty input yields an empty list.
        """
        if not paragraph_texts:
            return []

        sections = []
        total = len(paragraph_texts)

        current_type = 'unknown'
        current_paras: List[str] = []
        current_conf = 0.0
        start_idx = 0

        for i, para in enumerate(paragraph_texts):
            position_ratio = i / total

            if self.use_ml:
                sec_type, conf = self.detect_section_ml(para)
            else:
                sec_type, conf = self.detect_section(para, position_ratio)

            # Fallback: early unknown paragraphs are likely facts
            assumed_facts = sec_type == 'unknown' and position_ratio < 0.30 and i > 0
            if assumed_facts:
                sec_type = 'facts'
                conf = 0.4

            # The fallback confidence (0.4) does not clear the '> 0.4'
            # boundary test, so on its own it could never relabel anything.
            # Let it open a facts section while the document is still
            # untyped.
            opens_facts = assumed_facts and current_type == 'unknown'

            # Section boundary
            if sec_type != current_type and (conf > 0.4 or opens_facts):
                if current_paras:
                    sections.append(
                        self._build_section(
                            current_type, current_paras, start_idx, i - 1, current_conf
                        )
                    )

                current_type = sec_type
                current_paras = [para]
                current_conf = conf
                start_idx = i
            else:
                current_paras.append(para)
                current_conf = max(current_conf, conf)

        # Final section
        if current_paras:
            sections.append(
                self._build_section(
                    current_type, current_paras, start_idx, total - 1, current_conf
                )
            )

        return sections