File size: 7,343 Bytes
461adca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59fabbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db52560
 
 
 
 
 
 
 
 
 
 
 
 
59fabbc
 
 
 
 
 
 
db52560
59fabbc
db52560
 
 
 
 
 
 
 
 
59fabbc
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import unicodedata
import json
import requests
import re
from bs4 import BeautifulSoup
from typing import Union, List, Dict, Optional

def clean_text(text: str) -> str:
    """Normalize a string by mapping typographic punctuation to plain ASCII.

    Replaces curly quotes, long dashes, ellipses, non-breaking spaces and
    accent marks with ASCII equivalents, collapses whitespace runs to single
    spaces, and strips Unicode control characters (category "C*").

    Args:
        text: Input string; falsy input (empty string/None) is returned as-is.

    Returns:
        The cleaned string. On any unexpected error the text processed so
        far is returned instead of raising (best-effort cleaning).
    """
    if not text:
        return text

    # NOTE(review): the literal quote/backtick keys on the first line appear
    # garbled in this encoding — the \uXXXX escape entries below cover the
    # same characters, so the mapping still works; verify the literals against
    # the original file encoding.
    replacements = {
        ''': "'", '`': "'", '´': "'", ''': "'", '"': '"', '"': '"',
        '–': '-', '—': '-', '…': '...',
        '\u2018': "'", '\u2019': "'", '\u201c': '"', '\u201d': '"',
        '\u2013': '-', '\u2014': '-', '\u2026': '...',
        '\xa0': ' ', '\u0027': "'", '\u02BC': "'", '\u02B9': "'",
        '\u0301': "", '\u0060': "'", '\u00B4': "'"
    }

    try:
        # NFKD decomposition splits accented characters into base char +
        # combining mark, so the '\u0301' (combining acute) replacement
        # above can drop the accent while keeping the base letter.
        text = unicodedata.normalize('NFKD', text)
        for old, new in replacements.items():
            text = text.replace(old, new)
        # Collapse all whitespace runs (spaces, tabs, newlines) to one space.
        text = ' '.join(text.split())
        # Remove control characters (Unicode categories Cc, Cf, Cs, Co, Cn).
        text = ''.join(char for char in text
                      if not unicodedata.category(char).startswith('C'))
        return text
    except Exception as e:
        # Best-effort: report and return whatever was produced so far.
        print(f"Error in clean_text: {str(e)}")
        return text

def extract_court_decision_text(url: str) -> str:
    """Download and extract the decision text from a reyestr.court.gov.ua page.

    Tries three extraction strategies, most specific first:
      1. the registry's ``<textarea id="txtdepository">``, which embeds the
         decision's HTML markup as escaped text (parsed a second time);
      2. all ``<p>`` elements on the outer page, minus known boilerplate;
      3. a ``<div class="wordwrap">`` container as a last resort.

    Args:
        url: Full URL of the decision page.

    Returns:
        The extracted decision text, newline-separated and cleaned of
        boilerplate and very short lines.

    Raises:
        Exception: if the download fails, or if fewer than 100 characters
            of usable text could be extracted.
    """
    try:
        # Browser-like User-Agent plus a timeout makes the request more
        # reliable against the registry's bot filtering and slow responses.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        # Chain the original exception (`from e`) so the root network cause
        # is preserved in the traceback instead of being swallowed.
        raise Exception(f"Помилка при завантаженні URL: {str(e)}") from e

    soup = BeautifulSoup(response.content, 'html.parser')

    # Registry boilerplate notices that must never reach the output.
    unwanted_texts = [
        "Доступ до Реєстру здійснюється в тестовому (обмеженому) режимі.",
        "З метою упередження перешкоджанню стабільній роботі Реєстру"
    ]

    result = ""

    # Strategy 1: site-specific textarea holding the decision HTML as text.
    txtdepository = soup.find('textarea', id='txtdepository')
    if txtdepository:
        # The textarea content is itself HTML, so parse it a second time.
        embedded_html = txtdepository.get_text()
        embedded_soup = BeautifulSoup(embedded_html, 'html.parser')
        paragraphs = []
        for p in embedded_soup.find_all('p'):
            p_text = p.get_text(separator=" ").strip()
            # Normalize non-breaking spaces to regular spaces.
            p_text = p_text.replace('\xa0', ' ').replace(' ', ' ')
            if p_text and len(p_text) > 10:  # Skip very short paragraphs
                paragraphs.append(p_text)
        if paragraphs:
            result = "\n\n".join(paragraphs)

    # Strategy 2: fall back to the <p> elements of the outer page.
    if not result or len(result) < 100:
        decision_text = []
        for paragraph in soup.find_all('p'):
            text = paragraph.get_text(separator="\n").strip()
            if not any(unwanted_text in text for unwanted_text in unwanted_texts):
                decision_text.append(text)
        result = "\n".join(decision_text).strip()

    # Strategy 3: last resort — a wordwrap container div.
    if not result or len(result) < 100:
        wordwrap = soup.find('div', class_='wordwrap')
        if wordwrap:
            result = wordwrap.get_text(separator="\n").strip()

    # Final cleanup: drop blank/very short lines and boilerplate remnants.
    if result:
        lines = result.split('\n')
        cleaned_lines = [
            line.strip() for line in lines
            if line.strip() and len(line.strip()) > 5
            and not any(unwanted in line for unwanted in unwanted_texts)
        ]
        result = '\n'.join(cleaned_lines)

    print(f"[DEBUG] Extracted {len(result)} characters from URL")

    if not result or len(result) < 100:
        raise Exception("Не вдалося витягти текст судового рішення з URL. Можливо, сторінка використовує JavaScript або структура змінилася.")

    return result

def parse_doc_ids(doc_ids: Union[List, str, None]) -> List[str]:
    """Normalize document IDs into a flat list of strings.

    Accepts a list of IDs, a comma-separated string (optionally wrapped in
    square brackets), or None. Anything else yields an empty list.
    """
    if doc_ids is None:
        return []
    if isinstance(doc_ids, list):
        # Stringify each element and drop any stray bracket characters.
        return [str(item).strip('[]') for item in doc_ids]
    if not isinstance(doc_ids, str):
        return []
    # Bracket wrapper and all spaces removed, then split on commas.
    compact = doc_ids.strip('[]').replace(' ', '')
    if not compact:
        return []
    return [token.strip() for token in compact.split(',')]

def get_links_html(doc_ids: Union[List, str, None]) -> str:
    """Render document IDs as a comma-separated list of Markdown links.

    Returns an empty string when no IDs can be parsed from the input.
    """
    ids = parse_doc_ids(doc_ids)
    if not ids:
        return ""
    return ", ".join(
        f"[Рішення ВС: {doc_id}](https://reyestr.court.gov.ua/Review/{doc_id})"
        for doc_id in ids
    )

def parse_lp_ids(lp_ids: Union[str, int, None]) -> List[str]:
    """Normalize a single legal-position ID into a one-element string list.

    Accepts a string or integer ID (bracket wrapping and spaces are
    stripped). None, other types, or an empty value yield an empty list.
    """
    if lp_ids is None or not isinstance(lp_ids, (str, int)):
        return []
    value = str(lp_ids).strip('[]').replace(' ', '')
    return [value] if value else []

def get_links_html_lp(lp_ids: Union[str, int, None]) -> str:
    """Render legal-position IDs as a comma-separated list of Markdown links.

    Returns an empty string when no IDs can be parsed from the input.
    """
    ids = parse_lp_ids(lp_ids)
    if not ids:
        return ""
    return ", ".join(
        f"[ПП ВС: {lp_id}](https://lpd.court.gov.ua/home/search/{lp_id})"
        for lp_id in ids
    )

def extract_json_from_text(text: str) -> Optional[Dict]:
    """Best-effort extraction of a JSON object from noisy model/LLM output.

    Tries, in order: parsing the whole string; peeling off a fenced code
    block (``` or ''' fences, with or without a "json" tag); and finally
    scanning from the first '{' to progressively earlier '}' characters to
    tolerate trailing noise. Returns the parsed object or None.
    """
    if not text:
        return None

    stripped = text.strip()

    # Attempt 1: the payload is already valid JSON.
    try:
        return json.loads(stripped)
    except json.JSONDecodeError:
        pass

    # Attempt 2: strip a markdown-style fence and keep its first section.
    candidate_text = stripped
    for fence in ("```json", "'''json", "```", "'''"):
        if fence not in candidate_text:
            continue
        try:
            segments = candidate_text.split(fence)
            if len(segments) > 1:
                closing = fence.replace("json", "")
                inner = segments[1].split(closing)[0].strip()
                if inner:
                    candidate_text = inner
                    break
        except Exception:
            continue

    try:
        return json.loads(candidate_text)
    except json.JSONDecodeError:
        pass

    # Attempt 3: first '{' paired with the latest '}' that parses —
    # handles trailing noise after the object.
    start = candidate_text.find('{')
    if start != -1:
        for end in range(len(candidate_text) - 1, start, -1):
            if candidate_text[end] != '}':
                continue
            try:
                return json.loads(candidate_text[start:end + 1])
            except json.JSONDecodeError:
                continue

    return None