# Source: samchun-gemini / utils / context_compression.py
# Uploaded by JHyeok5
# Commit message: Upload folder using huggingface_hub
# Revision: 0f3460d (verified)
"""
Context Compression Utilities
Compression utilities for reducing AI prompt token usage.
@module context_compression
@description
- Converts spot data into a compressed format to reduce prompt tokens
- Goal: prompt tokens 5,000 -> 500 (90% reduction)
- 30% lower API cost, 20-30% faster responses
@changelog
- v1.0.0 (2026-01-25): Initial implementation
  - compress_spot: compress a single spot (JSON -> pipe-delimited string)
  - compress_spots: compress a list of spots
  - decompress_course_spots: restore the original spots from the spot_ids in the AI response
  - create_compression_guide: compressed-format guide for the prompt
@example
Before (about 500 tokens):
    {
        "id": "vj_123",
        "name": "하귀포구",
        "category": "포구",
        "location": {"lat": 33.456, "lng": 126.789},
        "tags": ["역사", "바다", "사진"],
        "story_preview": "400년 전 왜구의 침략..."
    }
After (about 50 tokens):
    vj_123|하귀포구|포구|33.4560,126.7890|역사,바다,사진|15
"""
from typing import List, Dict, Any, Optional
import logging
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def _format_coord(value: Any) -> str:
    """Format a coordinate with 4 decimals; unusable values become 0.0000."""
    try:
        # Numeric values take the same formatting path as before.
        return f"{value:.4f}"
    except (TypeError, ValueError):
        pass
    try:
        # Numeric strings such as "126.5" are accepted as well.
        return f"{float(value):.4f}"
    except (TypeError, ValueError):
        # None or non-numeric text: use the same 0 default as a missing key.
        return f"{0:.4f}"


def compress_spot(spot: Dict[str, Any]) -> str:
    """
    Convert a single spot into the compressed, pipe-delimited format.

    Every field is read defensively: a key that is missing or explicitly set
    to ``None`` falls back to the same default, so a partially populated spot
    neither raises nor leaks the text "None" into the prompt.

    Args:
        spot: Spot dictionary. Keys read: ``id``, ``name``, ``category``,
            ``location`` (dict with ``lat`` / ``lng``), ``tags`` (sequence)
            and ``meta`` (dict with ``stay_duration_min``).

    Returns:
        Compressed string in the form
        ``spot_id|name|category|lat,lng|tag1,tag2|stay_minutes``.
        Coordinates use 4 decimal places (default 0), only the first 5 tags
        are kept, and the stay duration defaults to 15.

    Example:
        Input: {"id": "vj_123", "name": "하귀포구", ...}
        Output: "vj_123|하귀포구|포구|33.4560,126.7890|역사,바다|15"
    """
    def _field(key: str) -> Any:
        # Treat an explicit None like a missing key.
        value = spot.get(key)
        return "" if value is None else value

    # Basic fields
    spot_id = _field("id")
    name = _field("name")
    category = _field("category")

    # Location, limited to 4 decimal places
    location = spot.get("location") or {}
    loc_str = f"{_format_coord(location.get('lat', 0))},{_format_coord(location.get('lng', 0))}"

    # Tags, limited to the first 5; str() keeps non-string tags from breaking join
    tags = spot.get("tags") or []
    tags_str = ",".join(str(tag) for tag in tags[:5])

    # Stay duration comes from meta; fall back to 15 when absent or None
    meta = spot.get("meta") or {}
    stay_duration = meta.get("stay_duration_min")
    if stay_duration is None:
        stay_duration = 15

    return f"{spot_id}|{name}|{category}|{loc_str}|{tags_str}|{stay_duration}"
def compress_spots(spots: List[Dict[str, Any]]) -> str:
    """
    Compress a list of spots into a multi-line block of text.

    Args:
        spots: List of spot dictionaries.

    Returns:
        One compress_spot() line per spot, in input order, joined with
        newline characters (an empty list gives an empty string).

    Example:
        vj_001|하귀포구|포구|33.4560,126.7890|역사,바다|15
        vj_002|곽지해변|해변|33.4567,126.7891|자연,사진|20
    """
    return "\n".join(map(compress_spot, spots))
def create_compression_guide() -> str:
"""
์••์ถ• ํ˜•์‹ ์„ค๋ช… (ํ”„๋กฌํ”„ํŠธ์— ํฌํ•จ)
AI๊ฐ€ ์••์ถ•๋œ ๋ฐ์ดํ„ฐ๋ฅผ ์ดํ•ดํ•  ์ˆ˜ ์žˆ๋„๋ก ํ˜•์‹ ์„ค๋ช… ์ œ๊ณต
Returns:
ํ”„๋กฌํ”„ํŠธ์— ์‚ฝ์ž…ํ•  ํ˜•์‹ ๊ฐ€์ด๋“œ ๋ฌธ์ž์—ด
"""
return """**์ŠคํŒŸ ๋ฐ์ดํ„ฐ ํ˜•์‹ (์••์ถ•)**
๊ฐ ์ค„ ํ˜•์‹: spot_id|์ด๋ฆ„|์นดํ…Œ๊ณ ๋ฆฌ|์œ„๋„,๊ฒฝ๋„|ํƒœ๊ทธ๋“ค|์ฒด๋ฅ˜์‹œ๊ฐ„(๋ถ„)
์˜ˆ์‹œ: vj_123|ํ•˜๊ท€ํฌ๊ตฌ|ํฌ๊ตฌ|33.4560,126.7890|์—ญ์‚ฌ,๋ฐ”๋‹ค|15
**์ค‘์š”**: ์‘๋‹ต์˜ spot_id๋Š” ๋ฐ˜๋“œ์‹œ ์œ„ ๋ชฉ๋ก์— ์žˆ๋Š” ID๋งŒ ์‚ฌ์šฉํ•˜์„ธ์š”.
"""
def decompress_course_spots(
    compressed_spot_ids: List[str],
    original_spots: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """
    Restore the original spot data for the spot_ids returned by the AI.

    Args:
        compressed_spot_ids: spot_id list returned by the AI. ``None`` is
            treated as an empty list.
        original_spots: Original spot dictionaries. ``None`` is treated as an
            empty list. Spots without an ``id`` are ignored, and when several
            spots share an id the last one wins.

    Returns:
        The original spot dictionaries (same objects, not copies) in the
        order of ``compressed_spot_ids``. Unknown ids are skipped with a
        warning; repeated ids yield repeated entries.

    Example:
        Input: ["vj_001", "vj_003", "vj_005"]
        Output: [{"id": "vj_001", ...}, {"id": "vj_003", ...}, ...]
    """
    # spot_id -> original spot. A spot with no id can never be referenced by
    # the AI, so it is left out instead of raising KeyError.
    spot_map = {
        spot["id"]: spot
        for spot in (original_spots or [])
        if spot.get("id") is not None
    }

    decompressed = []
    for spot_id in compressed_spot_ids or []:
        if spot_id in spot_map:
            decompressed.append(spot_map[spot_id])
        else:
            # Lazy %-style arguments: formatted only if the record is emitted.
            logger.warning("[decompress] spot_id not found: %s", spot_id)
    return decompressed
def calculate_compression_ratio(
    original_json: str,
    compressed_str: str,
    chars_per_token: int = 2
) -> Dict[str, Any]:
    """
    Compute compression statistics for an original/compressed string pair.

    Args:
        original_json: Original JSON string.
        compressed_str: Compressed string.
        chars_per_token: Characters assumed per token for the rough token
            estimate. Defaults to 2 (approximation for Korean text).

    Returns:
        Dictionary with:
        - ``original_chars`` / ``compressed_chars``: string lengths
        - ``original_tokens_est`` / ``compressed_tokens_est``: length floor-
          divided by ``chars_per_token``
        - ``ratio_percent``: compressed size as a percentage of the original,
          rounded to 1 decimal
        - ``savings_percent``: ``100 - ratio``, rounded to 1 decimal
        For an empty original both percentages are 0.0, since there is
        nothing to compress and therefore nothing saved.

    Raises:
        ValueError: If ``chars_per_token`` is less than 1.
    """
    if chars_per_token < 1:
        raise ValueError("chars_per_token must be >= 1")

    original_len = len(original_json)
    compressed_len = len(compressed_str)

    # Rough token estimate (about 2 characters per token for Korean by default)
    original_tokens_est = original_len // chars_per_token
    compressed_tokens_est = compressed_len // chars_per_token

    if original_len > 0:
        ratio = compressed_len / original_len * 100
        savings = 100 - ratio
    else:
        # Avoid division by zero and do not claim 100% savings on empty input.
        ratio = 0.0
        savings = 0.0

    return {
        "original_chars": original_len,
        "compressed_chars": compressed_len,
        "original_tokens_est": original_tokens_est,
        "compressed_tokens_est": compressed_tokens_est,
        "ratio_percent": round(ratio, 1),
        "savings_percent": round(savings, 1),
    }