File size: 7,343 Bytes
461adca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59fabbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db52560
 
 
 
 
 
 
 
 
 
 
 
 
59fabbc
 
 
 
 
 
 
db52560
59fabbc
db52560
 
 
 
 
 
 
 
 
59fabbc
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import unicodedata
import json
import requests
import re
from bs4 import BeautifulSoup
from typing import Union, List, Dict, Optional

def clean_text(text: str) -> str:
    """Normalize a string by mapping typographic punctuation to plain ASCII.

    Replaces curly quotes, long dashes, ellipses, non-breaking spaces and
    accent marks with ASCII equivalents, collapses whitespace runs to single
    spaces, and strips Unicode control characters (category "C*").

    Args:
        text: Input string; falsy input (empty string/None) is returned as-is.

    Returns:
        The cleaned string. On any unexpected error the text processed so
        far is returned instead of raising (best-effort cleaning).
    """
    if not text:
        return text

    # NOTE(review): the literal quote/backtick keys on the first line appear
    # garbled in this encoding — the \uXXXX escape entries below cover the
    # same characters, so the mapping still works; verify the literals against
    # the original file encoding.
    replacements = {
        ''': "'", '`': "'", '´': "'", ''': "'", '"': '"', '"': '"',
        '–': '-', '—': '-', '…': '...',
        '\u2018': "'", '\u2019': "'", '\u201c': '"', '\u201d': '"',
        '\u2013': '-', '\u2014': '-', '\u2026': '...',
        '\xa0': ' ', '\u0027': "'", '\u02BC': "'", '\u02B9': "'",
        '\u0301': "", '\u0060': "'", '\u00B4': "'"
    }

    try:
        # NFKD decomposition splits accented characters into base char +
        # combining mark, so the '\u0301' (combining acute) replacement
        # above can drop the accent while keeping the base letter.
        text = unicodedata.normalize('NFKD', text)
        for old, new in replacements.items():
            text = text.replace(old, new)
        # Collapse all whitespace runs (spaces, tabs, newlines) to one space.
        text = ' '.join(text.split())
        # Remove control characters (Unicode categories Cc, Cf, Cs, Co, Cn).
        text = ''.join(char for char in text
                      if not unicodedata.category(char).startswith('C'))
        return text
    except Exception as e:
        # Best-effort: report and return whatever was produced so far.
        print(f"Error in clean_text: {str(e)}")
        return text

def extract_court_decision_text(url: str) -> str:
    """Download and extract the decision text from a reyestr.court.gov.ua page.

    Tries three extraction strategies, most specific first:
      1. the registry's ``<textarea id="txtdepository">``, which embeds the
         decision's HTML markup as escaped text (parsed a second time);
      2. all ``<p>`` elements on the outer page, minus known boilerplate;
      3. a ``<div class="wordwrap">`` container as a last resort.

    Args:
        url: Full URL of the decision page.

    Returns:
        The extracted decision text, newline-separated and cleaned of
        boilerplate and very short lines.

    Raises:
        Exception: if the download fails, or if fewer than 100 characters
            of usable text could be extracted.
    """
    try:
        # Browser-like User-Agent plus a timeout makes the request more
        # reliable against the registry's bot filtering and slow responses.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        # Chain the original exception (`from e`) so the root network cause
        # is preserved in the traceback instead of being swallowed.
        raise Exception(f"Помилка при завантаженні URL: {str(e)}") from e

    soup = BeautifulSoup(response.content, 'html.parser')

    # Registry boilerplate notices that must never reach the output.
    unwanted_texts = [
        "Доступ до Реєстру здійснюється в тестовому (обмеженому) режимі.",
        "З метою упередження перешкоджанню стабільній роботі Реєстру"
    ]

    result = ""

    # Strategy 1: site-specific textarea holding the decision HTML as text.
    txtdepository = soup.find('textarea', id='txtdepository')
    if txtdepository:
        # The textarea content is itself HTML, so parse it a second time.
        embedded_html = txtdepository.get_text()
        embedded_soup = BeautifulSoup(embedded_html, 'html.parser')
        paragraphs = []
        for p in embedded_soup.find_all('p'):
            p_text = p.get_text(separator=" ").strip()
            # Normalize non-breaking spaces to regular spaces.
            p_text = p_text.replace('\xa0', ' ').replace(' ', ' ')
            if p_text and len(p_text) > 10:  # Skip very short paragraphs
                paragraphs.append(p_text)
        if paragraphs:
            result = "\n\n".join(paragraphs)

    # Strategy 2: fall back to the <p> elements of the outer page.
    if not result or len(result) < 100:
        decision_text = []
        for paragraph in soup.find_all('p'):
            text = paragraph.get_text(separator="\n").strip()
            if not any(unwanted_text in text for unwanted_text in unwanted_texts):
                decision_text.append(text)
        result = "\n".join(decision_text).strip()

    # Strategy 3: last resort — a wordwrap container div.
    if not result or len(result) < 100:
        wordwrap = soup.find('div', class_='wordwrap')
        if wordwrap:
            result = wordwrap.get_text(separator="\n").strip()

    # Final cleanup: drop blank/very short lines and boilerplate remnants.
    if result:
        lines = result.split('\n')
        cleaned_lines = [
            line.strip() for line in lines
            if line.strip() and len(line.strip()) > 5
            and not any(unwanted in line for unwanted in unwanted_texts)
        ]
        result = '\n'.join(cleaned_lines)

    print(f"[DEBUG] Extracted {len(result)} characters from URL")

    if not result or len(result) < 100:
        raise Exception("Не вдалося витягти текст судового рішення з URL. Можливо, сторінка використовує JavaScript або структура змінилася.")

    return result

def parse_doc_ids(doc_ids: Union[List, str, None]) -> List[str]:
    """Normalize document IDs into a flat list of strings.

    Accepts a list of IDs, a comma-separated string (optionally wrapped in
    square brackets), or None. Anything else yields an empty list.
    """
    if doc_ids is None:
        return []
    if isinstance(doc_ids, list):
        # Stringify each element and drop any stray bracket characters.
        return [str(item).strip('[]') for item in doc_ids]
    if not isinstance(doc_ids, str):
        return []
    # Bracket wrapper and all spaces removed, then split on commas.
    compact = doc_ids.strip('[]').replace(' ', '')
    if not compact:
        return []
    return [token.strip() for token in compact.split(',')]

def get_links_html(doc_ids: Union[List, str, None]) -> str:
    """Render document IDs as a comma-separated list of Markdown links.

    Returns an empty string when no IDs can be parsed from the input.
    """
    ids = parse_doc_ids(doc_ids)
    if not ids:
        return ""
    return ", ".join(
        f"[Рішення ВС: {doc_id}](https://reyestr.court.gov.ua/Review/{doc_id})"
        for doc_id in ids
    )

def parse_lp_ids(lp_ids: Union[str, int, None]) -> List[str]:
    """Normalize a single legal-position ID into a one-element string list.

    Accepts a string or integer ID (bracket wrapping and spaces are
    stripped). None, other types, or an empty value yield an empty list.
    """
    if lp_ids is None or not isinstance(lp_ids, (str, int)):
        return []
    value = str(lp_ids).strip('[]').replace(' ', '')
    return [value] if value else []

def get_links_html_lp(lp_ids: Union[str, int, None]) -> str:
    """Render legal-position IDs as a comma-separated list of Markdown links.

    Returns an empty string when no IDs can be parsed from the input.
    """
    ids = parse_lp_ids(lp_ids)
    if not ids:
        return ""
    return ", ".join(
        f"[ПП ВС: {lp_id}](https://lpd.court.gov.ua/home/search/{lp_id})"
        for lp_id in ids
    )

def extract_json_from_text(text: str) -> Optional[Dict]:
    """Best-effort extraction of a JSON object from noisy model/LLM output.

    Tries, in order: parsing the whole string; peeling off a fenced code
    block (``` or ''' fences, with or without a "json" tag); and finally
    scanning from the first '{' to progressively earlier '}' characters to
    tolerate trailing noise. Returns the parsed object or None.
    """
    if not text:
        return None

    stripped = text.strip()

    # Attempt 1: the payload is already valid JSON.
    try:
        return json.loads(stripped)
    except json.JSONDecodeError:
        pass

    # Attempt 2: strip a markdown-style fence and keep its first section.
    candidate_text = stripped
    for fence in ("```json", "'''json", "```", "'''"):
        if fence not in candidate_text:
            continue
        try:
            segments = candidate_text.split(fence)
            if len(segments) > 1:
                closing = fence.replace("json", "")
                inner = segments[1].split(closing)[0].strip()
                if inner:
                    candidate_text = inner
                    break
        except Exception:
            continue

    try:
        return json.loads(candidate_text)
    except json.JSONDecodeError:
        pass

    # Attempt 3: first '{' paired with the latest '}' that parses —
    # handles trailing noise after the object.
    start = candidate_text.find('{')
    if start != -1:
        for end in range(len(candidate_text) - 1, start, -1):
            if candidate_text[end] != '}':
                continue
            try:
                return json.loads(candidate_text[start:end + 1])
            except json.JSONDecodeError:
                continue

    return None