# -*- coding: utf-8 -*-
"""
Glossary Compressor Module
Filters glossary entries based on source text to reduce token usage
"""
import os
import re
import json
import csv
from io import StringIO


def compress_glossary(glossary_content, source_text, glossary_format='auto'):
    """
    Compress glossary by excluding entries that don't appear in the source text.

    Args:
        glossary_content: Raw glossary content (CSV string or JSON dict/list)
        source_text: The source text to check against
        glossary_format: 'csv', 'json', or 'auto' (detect from content)

    Returns:
        Compressed glossary in the same format as input
    """
    if not glossary_content or not source_text:
        return glossary_content

    # Auto-detect format
    if glossary_format == 'auto':
        if isinstance(glossary_content, str):
            # Check if it looks like JSON
            stripped = glossary_content.strip()
            if stripped.startswith(('{', '[')) and stripped.endswith(('}', ']')):
                glossary_format = 'json'
            else:
                glossary_format = 'csv'
        elif isinstance(glossary_content, (dict, list)):
            glossary_format = 'json'
        else:
            return glossary_content

    if glossary_format == 'csv':
        return _compress_csv_glossary(glossary_content, source_text)
    elif glossary_format == 'json':
        return _compress_json_glossary(glossary_content, source_text)
    else:
        return glossary_content
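

# Illustrative sketch (not part of the original module) of how compress_glossary
# dispatches on input type; the sample terms below are hypothetical:
#
#   compress_glossary('type,raw_name,translated_name\ncharacter,나루토,Naruto',
#                     '나루토가 말했다')        # CSV path: keeps the 나루토 row
#   compress_glossary({'나루토': 'Naruto', '사스케': 'Sasuke'},
#                     '나루토가 말했다')        # JSON path: {'나루토': 'Naruto'}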


def _compress_csv_glossary(csv_content, source_text):
    """
    Compress CSV glossary by excluding entries not found in source text.
    Handles both legacy CSV format and token-efficient format.
    """
    if not isinstance(csv_content, str):
        return csv_content

    lines = csv_content.strip().split('\n')
    if not lines:
        return csv_content

    # Check if this is token-efficient format (has section headers like "=== CHARACTERS ===")
    is_token_efficient = any(line.strip().startswith('===') for line in lines)

    if is_token_efficient:
        return _compress_token_efficient_format(lines, source_text)
    else:
        return _compress_legacy_csv_format(lines, source_text)


def _compress_token_efficient_format(lines, source_text):
    """Compress token-efficient glossary format with section headers."""
    filtered_lines = []
    current_section = None

    for line in lines:
        stripped = line.strip()

        # Keep glossary header
        if stripped.lower().startswith('glossary:'):
            filtered_lines.append(line)
            continue

        # Track section headers; emit one only when an entry in its section survives
        if stripped.startswith('==='):
            current_section = line
            continue

        # Process entry lines (start with "* ")
        # Format: * TranslatedName (RawName) [Gender]
        if stripped.startswith('* '):
            # Extract the raw name from the entry
            match = re.search(r'\(([^)]+)\)', stripped)
            if match:
                raw_name = match.group(1).strip()
                # Check if the raw name appears in the source text
                keep = _text_contains_term(source_text, raw_name)
            else:
                # No parenthesized raw name to check against; keep the entry to be safe
                keep = True
            if keep:
                # Add section header if this is the first kept entry in the section
                if current_section:
                    filtered_lines.append(current_section)
                    current_section = None
                filtered_lines.append(line)
        elif not stripped:
            # Keep blank lines
            filtered_lines.append(line)

    return '\n'.join(filtered_lines)
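

# Example of the token-efficient format filtered above (hypothetical entries):
#
#   Glossary: MyNovel
#   === CHARACTERS ===
#   * Naruto (나루토) [Male]
#   * Sasuke (사스케) [Male]
#
# With a source text containing only 나루토, the 사스케 line is dropped, and the
# === CHARACTERS === header is emitted because at least one entry survived.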


def _compress_legacy_csv_format(lines, source_text):
    """Compress legacy CSV format with type,raw_name,translated_name columns."""
    if not lines:
        return ''

    # Check if first line is a header
    first_line = lines[0].strip().lower()
    has_header = first_line.startswith('type,') or 'raw_name' in first_line

    filtered_lines = []

    # Keep header if present
    if has_header:
        filtered_lines.append(lines[0])
        data_lines = lines[1:]
    else:
        data_lines = lines

    # Process each CSV row
    for line in data_lines:
        if not line.strip():
            continue
        try:
            # Parse the CSV line; columns are type,raw_name,translated_name
            parts = list(csv.reader(StringIO(line)))[0]
            if len(parts) >= 3:
                raw_name = parts[1].strip()
                # Check if the raw name appears in the source text
                if _text_contains_term(source_text, raw_name):
                    filtered_lines.append(line)
        except Exception:
            # If parsing fails, keep the line to be safe
            filtered_lines.append(line)

    return '\n'.join(filtered_lines)
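

# Example of the legacy CSV format filtered above (hypothetical rows). With a
# source text containing only 나루토, the 사스케 row is dropped:
#
#   type,raw_name,translated_name
#   character,나루토,Naruto
#   character,사스케,Sasuke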


def _compress_json_glossary(json_data, source_text):
    """
    Compress JSON glossary by excluding entries not found in source text.
    Handles both dict format and list format.
    """
    if isinstance(json_data, str):
        try:
            json_data = json.loads(json_data)
        except json.JSONDecodeError:
            return json_data

    if isinstance(json_data, dict):
        # Handle dict with 'entries' key
        if 'entries' in json_data:
            filtered_entries = {}
            for key, value in json_data['entries'].items():
                if _text_contains_term(source_text, key):
                    filtered_entries[key] = value
            result = json_data.copy()
            result['entries'] = filtered_entries
            return result
        else:
            # Simple dict format; always keep the metadata entry
            filtered_dict = {}
            for key, value in json_data.items():
                if key == 'metadata':
                    filtered_dict[key] = value
                elif _text_contains_term(source_text, key):
                    filtered_dict[key] = value
            return filtered_dict

    elif isinstance(json_data, list):
        # List of entry objects
        filtered_list = []
        for entry in json_data:
            if isinstance(entry, dict):
                # Check various possible keys for the raw term
                raw_term = entry.get('raw_name') or entry.get('original_name') or entry.get('original') or ''
                if raw_term and _text_contains_term(source_text, raw_term):
                    filtered_list.append(entry)
        return filtered_list

    return json_data
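

# The three JSON shapes handled above, with hypothetical entries:
#
#   {'entries': {'나루토': 'Naruto', '사스케': 'Sasuke'}}   # dict with 'entries' key
#   {'나루토': 'Naruto', 'metadata': {'lang': 'ko'}}        # simple dict; metadata kept
#   [{'raw_name': '나루토', 'translated_name': 'Naruto'}]   # list of entry objects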


def _text_contains_term(text, term):
    """
    Check if term appears in text using simple substring matching.
    Works well with Korean/CJK text where word boundaries are not clear.
    """
    if not term or not text:
        return False
    # For CJK languages (Korean, Chinese, Japanese), simple substring matching
    # works best; word boundaries don't apply the same way as in English
    return term in text
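

# For example, _text_contains_term('나루토가 등장했다', '나루토') is True. Note that
# substring matching can over-match in space-delimited languages (e.g. 'art'
# matches inside 'start'), a known trade-off of this approach.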


def compress_glossary_file(glossary_path, source_text):
    """
    Load, compress, and return glossary from file path.

    Args:
        glossary_path: Path to glossary file (.csv or .json)
        source_text: The source text to check against

    Returns:
        Compressed glossary content in appropriate format
    """
    if not glossary_path or not os.path.exists(glossary_path):
        return None

    try:
        with open(glossary_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Determine format from file extension
        if glossary_path.lower().endswith('.csv'):
            return compress_glossary(content, source_text, glossary_format='csv')
        elif glossary_path.lower().endswith('.json'):
            json_data = json.loads(content)
            compressed_data = compress_glossary(json_data, source_text, glossary_format='json')
            # Return as JSON string
            return json.dumps(compressed_data, ensure_ascii=False, indent=2)
        else:
            return content
    except Exception as e:
        print(f"⚠️ Failed to compress glossary: {e}")
        return None
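

if __name__ == '__main__':
    # Minimal self-test sketch (not part of the original module); the glossary
    # rows and source text below are hypothetical.
    sample_csv = (
        "type,raw_name,translated_name\n"
        "character,나루토,Naruto\n"
        "character,사스케,Sasuke"
    )
    source = "나루토가 마을로 돌아왔다."
    # Expect the header plus only the 나루토 row to survive
    print(compress_glossary(sample_csv, source, glossary_format='csv'))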