LP_2-test / utils.py
DocUA's picture
Increase max_tokens for generation settings and enhance JSON extraction to handle various markdown formats
db52560
raw
history blame
7.34 kB
import unicodedata
import json
import requests
import re
from bs4 import BeautifulSoup
from typing import Union, List, Dict, Optional
def clean_text(text: str) -> str:
    """Normalize text: unify typographic quote/dash variants to ASCII,
    collapse whitespace, and strip control characters.

    Fix over the original: the replacement table contained mojibake-garbled
    literal keys (curly quotes corrupted in the source, producing duplicate
    and invalid entries) and a no-op mapping of '\\u0027' to itself; the
    table is now written with unambiguous ``\\u`` escapes only.

    Args:
        text: Input string (may be empty or None-like falsy).

    Returns:
        Cleaned string; falsy input is returned unchanged. On any internal
        failure the (possibly partially cleaned) text is returned rather
        than raising.
    """
    if not text:
        return text
    # Typographic characters -> plain ASCII equivalents.
    replacements = {
        '\u2018': "'", '\u2019': "'",    # curly single quotes
        '\u201c': '"', '\u201d': '"',    # curly double quotes
        '\u2013': '-', '\u2014': '-',    # en dash / em dash
        '\u2026': '...',                 # horizontal ellipsis
        '\xa0': ' ',                     # no-break space
        '\u0060': "'", '\u00b4': "'",    # backtick / acute accent
        '\u02bc': "'", '\u02b9': "'",    # modifier apostrophes
        '\u0301': '',                    # combining acute accent (dropped)
    }
    try:
        # NFKD first so composed characters decompose (e.g. 'é' -> 'e' + U+0301),
        # letting the table above strip the combining accent.
        text = unicodedata.normalize('NFKD', text)
        for old, new in replacements.items():
            text = text.replace(old, new)
        # Collapse every run of Unicode whitespace to a single space.
        text = ' '.join(text.split())
        # Drop any remaining control characters (Unicode category C*).
        return ''.join(ch for ch in text
                       if not unicodedata.category(ch).startswith('C'))
    except Exception as e:
        # Best-effort contract: never crash the caller over cleaning.
        print(f"Error in clean_text: {str(e)}")
        return text
def extract_court_decision_text(url: str) -> str:
    """Extract the decision text from a court-decision URL.

    Specific to reyestr.court.gov.ua: the registry embeds the decision as
    HTML inside a ``<textarea id="txtdepository">``; two plainer page
    layouts are tried as fallbacks.

    Args:
        url: Full URL of a decision page on reyestr.court.gov.ua.

    Returns:
        The extracted decision text, one paragraph per line (double
        newlines between paragraphs for the textarea strategy).

    Raises:
        Exception: if the page cannot be downloaded, or if fewer than
            100 characters could be extracted (e.g. the page is rendered
            by JavaScript or the site structure changed).
    """
    try:
        # Browser-like User-Agent and a timeout for better reliability.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        # User-facing message, intentionally in Ukrainian: "Error loading URL".
        raise Exception(f"Помилка при завантаженні URL: {str(e)}")
    soup = BeautifulSoup(response.content, 'html.parser')
    # Registry boilerplate notices that must be filtered out of the result.
    unwanted_texts = [
        "Доступ до Реєстру здійснюється в тестовому (обмеженому) режимі.",
        "З метою упередження перешкоджанню стабільній роботі Реєстру"
    ]
    result = ""
    # Strategy 1: textarea with id="txtdepository" (reyestr.court.gov.ua specific).
    txtdepository = soup.find('textarea', id='txtdepository')
    if txtdepository:
        # The textarea's text content is itself HTML markup.
        embedded_html = txtdepository.get_text()
        # Parse the embedded HTML.
        embedded_soup = BeautifulSoup(embedded_html, 'html.parser')
        # Extract text paragraph by paragraph.
        paragraphs = []
        for p in embedded_soup.find_all('p'):
            p_text = p.get_text(separator=" ").strip()
            # Replace non-breaking spaces with regular spaces.
            # NOTE(review): the second replace looks like a non-ASCII space
            # variant garbled into a plain space in transcription — confirm
            # against the original file.
            p_text = p_text.replace('\xa0', ' ').replace(' ', ' ')
            if p_text and len(p_text) > 10:  # Skip very short paragraphs
                paragraphs.append(p_text)
        if paragraphs:
            result = "\n\n".join(paragraphs)
    # Strategy 2: paragraphs taken directly from the page (fallback).
    if not result or len(result) < 100:
        decision_text = []
        for paragraph in soup.find_all('p'):
            text = paragraph.get_text(separator="\n").strip()
            if not any(unwanted_text in text for unwanted_text in unwanted_texts):
                decision_text.append(text)
        result = "\n".join(decision_text).strip()
    # Strategy 3: last resort — the whole "wordwrap" div.
    if not result or len(result) < 100:
        wordwrap = soup.find('div', class_='wordwrap')
        if wordwrap:
            result = wordwrap.get_text(separator="\n").strip()
    # Clean up: drop blank/very short lines and registry boilerplate.
    if result:
        lines = result.split('\n')
        cleaned_lines = [
            line.strip() for line in lines
            if line.strip() and len(line.strip()) > 5
            and not any(unwanted in line for unwanted in unwanted_texts)
        ]
        result = '\n'.join(cleaned_lines)
    print(f"[DEBUG] Extracted {len(result)} characters from URL")
    # Under 100 characters is treated as an extraction failure.
    if not result or len(result) < 100:
        raise Exception("Не вдалося витягти текст судового рішення з URL. Можливо, сторінка використовує JavaScript або структура змінилася.")
    return result
def parse_doc_ids(doc_ids: Union[List, str, None]) -> List[str]:
    """Parse document IDs from various input formats into a list of strings.

    Accepts a list of IDs, a comma-separated string optionally wrapped in
    brackets (e.g. ``"[123, 456]"``), a bare scalar ID (e.g. an int), or
    None.

    Fix over the original: a non-list, non-str input (such as a bare int
    ID) fell through every branch and implicitly returned None, violating
    the declared ``List[str]`` return type; such values are now stringified
    and parsed like a string.

    Args:
        doc_ids: IDs in any of the supported shapes, or None.

    Returns:
        A list of ID strings; empty for None or empty input.
    """
    if doc_ids is None:
        return []
    if isinstance(doc_ids, list):
        # Stringify each element and drop stray bracket characters.
        return [str(doc_id).strip('[]') for doc_id in doc_ids]
    # Treat anything else (str, int, ...) as a comma-separated string.
    cleaned = str(doc_ids).strip('[]').replace(' ', '')
    if cleaned:
        return [part.strip() for part in cleaned.split(',')]
    return []
def get_links_html(doc_ids: Union[List, str, None]) -> str:
    """Build a comma-separated list of links to Supreme Court decisions
    in the state registry, one per parsed document ID.

    Note: despite the name, the output format is Markdown, not HTML.
    Returns "" when no IDs can be parsed from the input.
    """
    ids = parse_doc_ids(doc_ids)
    if not ids:
        return ""
    return ", ".join(
        f"[Рішення ВС: {doc_id}](https://reyestr.court.gov.ua/Review/{doc_id})"
        for doc_id in ids
    )
def parse_lp_ids(lp_ids: Union[str, int, None]) -> List[str]:
    """Parse a single legal-position ID (str or int) into a one-element list.

    Surrounding brackets and all spaces are stripped; None or an empty
    value yields []. Inputs outside the declared types fall through
    unchanged from the original (implicit None return).
    """
    if lp_ids is None:
        return []
    if isinstance(lp_ids, (str, int)):
        normalized = str(lp_ids).strip('[]').replace(' ', '')
        return [normalized] if normalized else []
def get_links_html_lp(lp_ids: Union[str, int, None]) -> str:
    """Build a comma-separated list of links to Supreme Court legal
    positions, one per parsed ID.

    Note: despite the name, the output format is Markdown, not HTML.
    Returns "" when no ID can be parsed from the input.
    """
    ids = parse_lp_ids(lp_ids)
    if not ids:
        return ""
    return ", ".join(
        f"[ПП ВС: {lp_id}](https://lpd.court.gov.ua/home/search/{lp_id})"
        for lp_id in ids
    )
def extract_json_from_text(text: str) -> Optional[Dict]:
    """Parse a JSON object out of *text*, tolerating markdown fences and noise.

    Tries, in order: direct parsing; peeling off a ``\u0060\u0060\u0060``/``'''`` code
    fence (json-tagged variants first); and finally scanning for the widest
    ``{...}`` span that parses. Returns None when nothing parses.
    """
    if not text:
        return None
    stripped = text.strip()
    # 1. Happy path: the whole payload is already valid JSON.
    try:
        return json.loads(stripped)
    except json.JSONDecodeError:
        pass
    # 2. Peel off a markdown code fence; the "json"-tagged forms take priority.
    for fence in ("```json", "'''json", "```", "'''"):
        if fence not in stripped:
            continue
        try:
            segments = stripped.split(fence)
            if len(segments) > 1:
                # Content runs up to the matching closing fence (no "json" tag).
                closing = fence.replace("json", "")
                inner = segments[1].split(closing)[0].strip()
                if inner:
                    stripped = inner
                    break
        except Exception:
            continue
    try:
        return json.loads(stripped)
    except json.JSONDecodeError:
        pass
    # 3. Brace scan: take the first '{', then shrink the candidate from the
    # right one '}' at a time to survive trailing noise or truncation.
    open_at = stripped.find('{')
    if open_at != -1:
        for close_at in range(len(stripped) - 1, open_at, -1):
            if stripped[close_at] != '}':
                continue
            try:
                return json.loads(stripped[open_at:close_at + 1])
            except json.JSONDecodeError:
                continue
    return None