LP_2-AI_Assistant / utils.py
Supreme-Court's picture
Update utils.py
ea09a15 verified
import unicodedata
import json
import requests
import re
from bs4 import BeautifulSoup
from typing import Union, List, Dict, Optional
def clean_text(text: str) -> str:
    """Normalize typography, strip HTML remnants and collapse whitespace.

    Processing steps, in order:

    1. Typographic quotes, dashes, ellipsis, non-breaking spaces and
       apostrophe look-alikes are mapped to plain ASCII, and the combining
       acute accent (U+0301) is removed.
    2. The text is NFKD-normalized and the same mapping is applied again to
       catch characters produced by the decomposition.
    3. HTML tags are replaced by spaces and a small set of HTML entities is
       decoded.
    4. Non-whitespace characters of Unicode category ``C*`` are dropped and
       runs of whitespace are collapsed to single spaces.
    5. The result is recomposed to NFC so that letters such as Cyrillic
       "й" / "ї" are returned as single code points.

    Args:
        text: Text to clean. Falsy values (``None``, ``""``) are returned
            unchanged.

    Returns:
        The cleaned text. If any step raises, the error is printed and the
        text in its state at the moment of failure is returned (best effort).
    """
    if not text:
        return text

    # One-pass translation table. U+0301 maps to "" so stress marks vanish.
    replacements = {
        '\u2018': "'", '\u2019': "'", '\u201c': '"', '\u201d': '"',
        '\u2013': '-', '\u2014': '-', '\u2026': '...',
        '\xa0': ' ', '\u02bc': "'", '\u02b9': "'",
        '\u0060': "'", '\u00b4': "'", '\u0301': '',
    }
    table = str.maketrans(replacements)

    # '&amp;' must be decoded last; otherwise '&amp;lt;' would first become
    # '&lt;' and then be decoded a second time into '<'.
    entities = (
        ('&nbsp;', ' '), ('&quot;', '"'), ('&lt;', '<'),
        ('&gt;', '>'), ('&apos;', "'"), ('&amp;', '&'),
    )

    try:
        # Map look-alikes BEFORE normalizing: NFKD decomposes U+00B4 into
        # space + U+0301, which would turn an apostrophe into a space.
        text = text.translate(table)
        # Decompose, then re-apply the table to anything NFKD produced.
        text = unicodedata.normalize('NFKD', text).translate(table)

        # Remove HTML tags; paragraph boundaries become a single space.
        text = re.sub(r'</p>\s*<p>', ' ', text, flags=re.IGNORECASE)
        text = re.sub(r'<[^>]+>', ' ', text)

        for entity, plain in entities:
            text = text.replace(entity, plain)

        # Drop control/format characters but keep whitespace ones (tab,
        # newline, ...) so that the split below turns them into spaces.
        text = ''.join(
            char for char in text
            if char.isspace() or not unicodedata.category(char).startswith('C')
        )
        # Collapse whitespace after the removal so no double spaces remain.
        text = ' '.join(text.split())

        # Recompose what NFKD split apart (e.g. "и" + breve -> "й").
        return unicodedata.normalize('NFC', text)
    except Exception as e:
        print(f"Error in clean_text: {str(e)}")
        return text
def extract_court_decision_text(url: str) -> str:
    """Download a court decision page and return its text.

    Written specifically for the page layout of reyestr.court.gov.ua.
    Three extraction strategies are tried in order; each later one runs only
    if the text collected so far is shorter than 100 characters:

    1. ``<textarea id="txtdepository">`` whose content is parsed as embedded
       HTML; paragraphs longer than 10 characters are joined.
    2. All ``<p>`` elements of the page, skipping the known service notices.
    3. The first ``<div class="wordwrap">`` element.

    The collected text is then split into lines; blank lines, lines of five
    characters or fewer and lines containing a service notice are dropped.

    Args:
        url: Address of the decision page.

    Returns:
        The extracted decision text with one cleaned line per row.

    Raises:
        Exception: If the download fails (the underlying ``requests`` error
            is chained as ``__cause__``) or if fewer than 100 characters of
            text could be extracted.
    """
    try:
        # Send a browser-like User-Agent and bound the wait with a timeout.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        # Chain the cause so the original network error stays in the traceback.
        raise Exception(f"Помилка при завантаженні URL: {str(e)}") from e

    soup = BeautifulSoup(response.content, 'html.parser')

    # Service notices that must not end up in the decision text.
    unwanted_texts = [
        "Доступ до Реєстру здійснюється в тестовому (обмеженому) режимі.",
        "З метою упередження перешкоджанню стабільній роботі Реєстру"
    ]

    result = ""

    # Strategy 1: textarea with id="txtdepository" (reyestr.court.gov.ua specific).
    txtdepository = soup.find('textarea', id='txtdepository')
    if txtdepository:
        # The textarea holds HTML markup as text, so it is parsed a second time.
        embedded_html = txtdepository.get_text()
        embedded_soup = BeautifulSoup(embedded_html, 'html.parser')

        paragraphs = []
        for p in embedded_soup.find_all('p'):
            p_text = p.get_text(separator=" ").strip()
            # Replace non-breaking spaces (decoded or still as an entity).
            p_text = p_text.replace('\xa0', ' ').replace('&nbsp;', ' ')
            if p_text and len(p_text) > 10:  # Skip very short paragraphs
                paragraphs.append(p_text)

        if paragraphs:
            result = "\n\n".join(paragraphs)

    # Strategy 2: paragraphs of the page itself (fallback).
    if not result or len(result) < 100:
        decision_text = []
        for paragraph in soup.find_all('p'):
            text = paragraph.get_text(separator="\n").strip()
            if not any(unwanted_text in text for unwanted_text in unwanted_texts):
                decision_text.append(text)
        result = "\n".join(decision_text).strip()

    # Strategy 3: the "wordwrap" div, if the result is still too short.
    if not result or len(result) < 100:
        wordwrap = soup.find('div', class_='wordwrap')
        if wordwrap:
            result = wordwrap.get_text(separator="\n").strip()

    # Clean up: drop blank/very short lines and leftover service notices.
    if result:
        lines = result.split('\n')
        cleaned_lines = [
            line.strip() for line in lines
            if line.strip() and len(line.strip()) > 5
            and not any(unwanted in line for unwanted in unwanted_texts)
        ]
        result = '\n'.join(cleaned_lines)

    print(f"[DEBUG] Extracted {len(result)} characters from URL")

    if not result or len(result) < 100:
        raise Exception("Не вдалося витягти текст судового рішення з URL. Можливо, сторінка використовує JavaScript або структура змінилася.")

    return result
def parse_doc_ids(doc_ids: Union[List, str, None]) -> List[str]:
    """Parse document IDs from various input formats.

    Args:
        doc_ids: ``None``, a list of IDs (any items convertible with
            ``str``), or a string such as ``"123"``, ``"123, 456"`` or
            ``"[123, 456]"``.

    Returns:
        A list of non-empty ID strings with surrounding whitespace and
        square brackets removed. ``None``, unsupported types and inputs
        without any ID yield an empty list.
    """
    if doc_ids is None:
        return []

    if isinstance(doc_ids, list):
        # None items are skipped so they do not turn into the ID "None".
        candidates = [
            str(item).strip().strip('[]')
            for item in doc_ids
            if item is not None
        ]
    elif isinstance(doc_ids, str):
        # Trim whitespace first; otherwise a leading/trailing blank would
        # shield the brackets from being stripped.
        candidates = doc_ids.strip().strip('[]').replace(' ', '').split(',')
    else:
        return []

    # Drop empty entries produced by inputs like "1,,2" or "1,2,".
    return [doc_id for doc_id in (c.strip() for c in candidates) if doc_id]
def get_links_html(doc_ids: Union[List, str, None]) -> str:
    """Build a comma-separated string of links to court decisions.

    The input is first passed through ``parse_doc_ids``. Each resulting ID
    is rendered as a Markdown-style link ``[label](url)`` pointing at the
    ``/Review/<id>`` page of reyestr.court.gov.ua.

    Args:
        doc_ids: Document IDs in any format accepted by ``parse_doc_ids``.

    Returns:
        The links joined with ", ", or an empty string when no ID was parsed.
    """
    rendered = []
    for doc_id in parse_doc_ids(doc_ids):
        rendered.append(
            f"[Рішення ВС: {doc_id}](https://reyestr.court.gov.ua/Review/{doc_id})"
        )
    # Joining an empty list yields "", matching the no-ID case.
    return ", ".join(rendered)
def parse_lp_ids(lp_ids: Union[str, int, None]) -> List[str]:
    """Parse legal position IDs.

    Args:
        lp_ids: ``None``, an integer ID, or a string holding one ID or a
            comma-separated list of IDs, optionally wrapped in square
            brackets (``"42"``, ``"[42]"``, ``"1, 2"``).

    Returns:
        A list of non-empty ID strings with whitespace and surrounding
        brackets removed. ``None``, unsupported types and inputs without
        any ID yield an empty list.
    """
    if lp_ids is None or not isinstance(lp_ids, (str, int)):
        return []

    # Trim whitespace before the brackets; otherwise " [5] " keeps its "[".
    cleaned = str(lp_ids).strip().strip('[]').replace(' ', '')

    # Split on commas so that several IDs become several entries (mirrors
    # parse_doc_ids) and drop empty fragments.
    return [lp_id for lp_id in (part.strip() for part in cleaned.split(',')) if lp_id]
def get_links_html_lp(lp_ids: Union[str, int, None]) -> str:
    """Build a comma-separated string of links to legal positions.

    The input is first passed through ``parse_lp_ids``. Each resulting ID is
    rendered as a Markdown-style link ``[label](url)`` pointing at the
    ``/legal-position/<id>`` page of lpd.court.gov.ua.

    Args:
        lp_ids: Legal position ID(s) in any format accepted by
            ``parse_lp_ids``.

    Returns:
        The links joined with ", ", or an empty string when no ID was parsed.
    """
    rendered = []
    for lp_id in parse_lp_ids(lp_ids):
        rendered.append(
            f"[ПП ВС: {lp_id}](https://lpd.court.gov.ua/legal-position/{lp_id})"
        )
    # Joining an empty list yields "", matching the no-ID case.
    return ", ".join(rendered)
def extract_json_from_text(text: str) -> Optional[Dict]:
    """Extract and parse JSON from text, handling markdown blocks and noise.

    Three attempts are made, in order:

    1. Parse the whole (stripped) text as JSON.
    2. If the text contains a fenced block (```` ```json ````, ``'''json``,
       ```` ``` ```` or ``'''``), parse the content of the first such block.
    3. Scan for ``{`` characters from left to right and return the first
       JSON object that can be decoded starting at one of them; anything
       after the object is ignored.

    Args:
        text: Raw text that may contain JSON.

    Returns:
        The decoded JSON value, or ``None`` if the text is empty or nothing
        could be decoded. Attempts 1 and 2 return whatever JSON value the
        text holds; attempt 3 always yields an object (``dict``).
    """
    if not text:
        return None

    text_to_parse = text.strip()

    # 1. Try direct parsing.
    try:
        return json.loads(text_to_parse)
    except json.JSONDecodeError:
        pass

    # 2. Unwrap a markdown-style code block (triple backticks or triple
    #    single quotes). Tagged delimiters are checked first so the "json"
    #    tag does not end up inside the candidate.
    for delimiter in ("```json", "'''json", "```", "'''"):
        if delimiter not in text_to_parse:
            continue
        parts = text_to_parse.split(delimiter)
        # Content after the first delimiter, up to the closing fence.
        candidate = parts[1].split(delimiter.replace("json", ""))[0].strip()
        if candidate:
            text_to_parse = candidate
            break

    try:
        return json.loads(text_to_parse)
    except json.JSONDecodeError:
        pass

    # 3. Last resort: decode an object starting at some "{". raw_decode stops
    #    at the end of the object, so trailing noise is tolerated, and trying
    #    every "{" copes with stray braces that precede the real JSON.
    decoder = json.JSONDecoder()
    start_idx = text_to_parse.find('{')
    while start_idx != -1:
        try:
            parsed, _ = decoder.raw_decode(text_to_parse, start_idx)
        except json.JSONDecodeError:
            start_idx = text_to_parse.find('{', start_idx + 1)
        else:
            return parsed

    return None