import json
import re
import unicodedata
from typing import Dict, List, Optional, Union

import requests
from bs4 import BeautifulSoup

# Typographic characters folded to plain ASCII by clean_text().
# Written as escapes so the table survives editors and encodings that
# "normalize" quotes.
_CHAR_REPLACEMENTS = {
    '\u2018': "'",    # left single quotation mark
    '\u2019': "'",    # right single quotation mark
    '\u201c': '"',    # left double quotation mark
    '\u201d': '"',    # right double quotation mark
    '\u2013': '-',    # en dash
    '\u2014': '-',    # em dash
    '\u2026': '...',  # horizontal ellipsis
    '\xa0': ' ',      # no-break space
    '\u02bc': "'",    # modifier letter apostrophe
    '\u02b9': "'",    # modifier letter prime
    '\u0301': '',     # combining acute accent (dropped)
    '\u0060': "'",    # grave accent / backtick
    '\u00b4': "'",    # spacing acute accent
}
_CHAR_TRANSLATION = str.maketrans(_CHAR_REPLACEMENTS)

_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

# Service banners that appear on the registry page and are not part of a decision.
_UNWANTED_TEXTS = (
    "Доступ до Реєстру здійснюється в тестовому (обмеженому) режимі.",
    "З метою упередження перешкоджанню стабільній роботі Реєстру",
)

# Anything shorter than this is treated as "extraction failed".
_MIN_DECISION_LENGTH = 100

# Fence openers tried, in order, by extract_json_from_text().
_FENCE_OPENERS = ("```json", "'''json", "```", "'''")


def clean_text(text: str) -> str:
    """Clean text from problematic characters.

    Typographic quotes, dashes, ellipses and apostrophe look-alikes are
    replaced with ASCII equivalents, combining acute accents are dropped,
    compatibility characters are folded (NFKD), runs of whitespace are
    collapsed to a single space and control/format characters (Unicode
    category ``C*``) are removed.  The result is returned in composed
    (NFC) form.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text.  Falsy input and non-string input are returned
        unchanged (for the latter an error line is printed).
    """
    if not text:
        return text
    if not isinstance(text, str):
        print(f"Error in clean_text: expected str, got {type(text).__name__}")
        return text

    # First pass BEFORE normalization: NFKD would otherwise split the
    # spacing acute accent (U+00B4) into "space + combining acute", turning
    # an intended apostrophe into a blank.
    text = text.translate(_CHAR_TRANSLATION)
    text = unicodedata.normalize('NFKD', text)
    # Second pass: map characters that only appear after decomposition
    # (e.g. the combining acute split off a precomposed letter).
    text = text.translate(_CHAR_TRANSLATION)

    text = ' '.join(text.split())
    text = ''.join(
        char for char in text
        if not unicodedata.category(char).startswith('C')
    )
    # Recompose so that letters such as Cyrillic "й"/"ї" are single code
    # points again instead of base letter + combining mark.
    return unicodedata.normalize('NFC', text)


def _contains_unwanted(text: str) -> bool:
    """Return True if *text* contains one of the registry service banners."""
    return any(unwanted in text for unwanted in _UNWANTED_TEXTS)


def extract_court_decision_text(url: str) -> str:
    """Extract text from a court decision URL (specific to reyestr.court.gov.ua).

    Three extraction strategies are tried in order, each one only if the
    previous result is shorter than 100 characters:

    1. the ``<textarea id="txtdepository">`` element, whose text is itself
       HTML and is parsed for ``<p>`` elements;
    2. every ``<p>`` element of the page, skipping service banners;
    3. the ``<div class="wordwrap">`` element.

    Args:
        url: Address of the decision page.

    Returns:
        The decision text, one non-trivial line per row.

    Raises:
        Exception: If the page cannot be downloaded (the underlying
            ``requests`` error is chained as the cause) or if fewer than
            100 characters of text could be extracted.
    """
    try:
        # Headers and a timeout make the request more reliable.
        response = requests.get(url, headers=_REQUEST_HEADERS, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        raise Exception(f"Помилка при завантаженні URL: {str(e)}") from e

    soup = BeautifulSoup(response.content, 'html.parser')
    result = ""

    # Strategy 1: textarea with id="txtdepository" (reyestr.court.gov.ua specific).
    textarea = soup.find('textarea', id='txtdepository')
    if textarea is not None:
        # The textarea contains HTML as text, so it is parsed a second time.
        embedded_soup = BeautifulSoup(textarea.get_text(), 'html.parser')
        paragraphs = []
        for p in embedded_soup.find_all('p'):
            # Replace no-break spaces with regular spaces.
            p_text = p.get_text(separator=" ").strip().replace('\xa0', ' ')
            if len(p_text) > 10:  # Skip very short paragraphs
                paragraphs.append(p_text)
        if paragraphs:
            result = "\n\n".join(paragraphs)

    # Strategy 2: paragraphs of the page itself (fallback).
    if len(result) < _MIN_DECISION_LENGTH:
        decision_text = []
        for paragraph in soup.find_all('p'):
            text = paragraph.get_text(separator="\n").strip()
            if not _contains_unwanted(text):
                decision_text.append(text)
        result = "\n".join(decision_text).strip()

    # Strategy 3: the "wordwrap" div.
    if len(result) < _MIN_DECISION_LENGTH:
        wordwrap = soup.find('div', class_='wordwrap')
        if wordwrap is not None:
            result = wordwrap.get_text(separator="\n").strip()

    # Clean up: drop blank/very short lines and service banners.
    if result:
        cleaned_lines = []
        for line in result.split('\n'):
            stripped = line.strip()
            if len(stripped) > 5 and not _contains_unwanted(stripped):
                cleaned_lines.append(stripped)
        result = '\n'.join(cleaned_lines)

    print(f"[DEBUG] Extracted {len(result)} characters from URL")

    if len(result) < _MIN_DECISION_LENGTH:
        raise Exception("Не вдалося витягти текст судового рішення з URL. Можливо, сторінка використовує JavaScript або структура змінилася.")

    return result


def _split_ids(raw: str) -> List[str]:
    """Split a comma-separated id string into clean, non-empty tokens."""
    tokens = []
    for token in raw.split(','):
        # Remove every whitespace character, then surrounding brackets/quotes.
        token = ''.join(token.split()).strip("[]'\"")
        if token:
            tokens.append(token)
    return tokens


def _parse_ids(value) -> List[str]:
    """Normalize None / str / int / list / tuple input to a list of id strings."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        ids = []
        for item in value:
            if item is not None:
                ids.extend(_split_ids(str(item)))
        return ids
    if isinstance(value, (str, int)):
        return _split_ids(str(value))
    return []


def parse_doc_ids(doc_ids: Union[List, str, None]) -> List[str]:
    """Parse document IDs from various input formats.

    Accepts ``None``, a single id (``str`` or ``int``), a comma-separated
    string such as ``"[1, 2]"`` or a list/tuple of ids.  Brackets, quotes
    and whitespace around ids are removed and empty entries are skipped.

    Returns:
        The ids as strings; an empty list for ``None`` or unsupported input.
    """
    return _parse_ids(doc_ids)


def get_links_html(doc_ids: Union[List, str, None]) -> str:
    """Generate Markdown links to reyestr.court.gov.ua for document IDs.

    Returns:
        Comma-separated ``[label](url)`` links, or ``""`` when there are no ids.
    """
    parsed_ids = parse_doc_ids(doc_ids)
    if not parsed_ids:
        return ""
    links = [
        f"[Рішення ВС: {doc_id}](https://reyestr.court.gov.ua/Review/{doc_id})"
        for doc_id in parsed_ids
    ]
    return ", ".join(links)


def parse_lp_ids(lp_ids: Union[str, int, None]) -> List[str]:
    """Parse legal position IDs.

    Accepts the same input formats as :func:`parse_doc_ids`; a
    comma-separated string yields one id per entry.

    Returns:
        The ids as strings; an empty list for ``None`` or unsupported input.
    """
    return _parse_ids(lp_ids)


def get_links_html_lp(lp_ids: Union[str, int, None]) -> str:
    """Generate Markdown links to lpd.court.gov.ua for legal position IDs.

    Returns:
        Comma-separated ``[label](url)`` links, or ``""`` when there are no ids.
    """
    parsed_ids = parse_lp_ids(lp_ids)
    if not parsed_ids:
        return ""
    links = [
        f"[ПП ВС: {lp_id}](https://lpd.court.gov.ua/home/search/{lp_id})"
        for lp_id in parsed_ids
    ]
    return ", ".join(links)


def extract_json_from_text(text: str) -> Optional[Dict]:
    """Extract and parse JSON from text, handling markdown blocks and other noise.

    The following attempts are made in order:

    1. parse the whole (stripped) text;
    2. parse the content of the first fenced block (triple backticks or
       triple single quotes, with or without a ``json`` tag);
    3. parse the JSON object that starts at the first ``{``, ignoring
       anything that follows it.

    Returns:
        The parsed value, or ``None`` if the text is empty or nothing could
        be parsed.  Attempts 1 and 2 return whatever JSON value the text
        holds, so the result is not guaranteed to be a dict.
    """
    if not text:
        return None

    text_to_parse = text.strip()

    # 1. Direct parsing.
    try:
        return json.loads(text_to_parse)
    except json.JSONDecodeError:
        pass

    # 2. Content of the first fenced block.
    for opener in _FENCE_OPENERS:
        if opener not in text_to_parse:
            continue
        closer = opener.replace("json", "")
        fenced = text_to_parse.split(opener, 1)[1].split(closer, 1)[0].strip()
        if fenced:
            text_to_parse = fenced
            break

    try:
        return json.loads(text_to_parse)
    except json.JSONDecodeError:
        pass

    # 3. Last resort: decode the object starting at the first "{".
    # raw_decode stops at the end of that object, so trailing noise is
    # ignored in a single pass instead of re-parsing one slice per "}".
    start_idx = text_to_parse.find('{')
    if start_idx != -1:
        try:
            parsed, _ = json.JSONDecoder().raw_decode(text_to_parse, start_idx)
            return parsed
        except json.JSONDecodeError:
            pass

    return None