#!/usr/bin/env python3 """Shared AI hook logger – works on Linux, Windows CMD, and PowerShell. Features: * Accepts **two positional arguments** (`prompt` and `response`) – ideal for manual testing. * Falls back to **stdin** when data is piped (e.g., `echo '{"prompt":"..."}' | python3 scripts/log_hook.py`). * If **no input** is provided, the script exits silently – prevents the "hang" you experienced. * Writes a JSON line to `.ai-log/session.jsonl` with a minimal set of fields required by the repository. """ import json import os import sys import subprocess from datetime import datetime, timezone, timedelta from pathlib import Path # Vietnam timezone (used by existing logs) VN_TZ = timezone(timedelta(hours=7)) DEFAULT_STUDENT_EMAIL = 'akirahoang617@gmail.com' # Typical artifacts when UTF-8 bytes are decoded with a legacy codepage first. MOJIBAKE_TOKENS = ('Ã', 'Â', 'Ä', 'Å', 'Æ', 'â€', 'á»') def _mojibake_score(text: str) -> int: return sum(text.count(token) for token in MOJIBAKE_TOKENS) def _fix_mojibake_text(value: str) -> str: """Best-effort repair for common UTF-8 mojibake in Windows pipelines.""" if not isinstance(value, str) or not value: return value original_score = _mojibake_score(value) if original_score == 0: return value best = value best_score = original_score for source_encoding in ('cp1252', 'latin-1'): try: candidate = value.encode(source_encoding).decode('utf-8') except (UnicodeEncodeError, UnicodeDecodeError): continue candidate_score = _mojibake_score(candidate) if candidate_score < best_score: best = candidate best_score = candidate_score return best def _read_stdin_text() -> str: """Read stdin as bytes first to avoid locale-dependent mis-decoding.""" raw_bytes = sys.stdin.buffer.read() if not raw_bytes: return '' for encoding in ('utf-8-sig', 'utf-8'): try: return raw_bytes.decode(encoding) except UnicodeDecodeError: continue # Last resort: decode with replacement to avoid crashing hooks. return raw_bytes.decode('utf-8', errors='replace') def _get_git_metadata() -> dict: """Collect git metadata using individual commands for cross-platform compatibility. Avoids shell=True and '&&' chaining which fails on PowerShell 5.1 (Win 10 default). Each git command is run as a direct subprocess call for maximum portability. """ metadata = { 'repo': Path.cwd().name, 'branch': 'unknown', 'commit': 'unknown', 'student': 'unknown' } git_commands = [ ('repo', ['git', 'remote', 'get-url', 'origin']), ('branch', ['git', 'rev-parse', '--abbrev-ref', 'HEAD']), ('commit', ['git', 'rev-parse', '--short', 'HEAD']), ('student', ['git', 'config', 'user.email']), ] for key, cmd in git_commands: try: result = subprocess.check_output( cmd, text=True, stderr=subprocess.DEVNULL).strip() if key == 'repo' and result: metadata['repo'] = result.split('/')[-1].replace('.git', '') elif result: metadata[key] = result except Exception: pass metadata['student'] = os.getenv('AI_LOG_STUDENT_EMAIL') or DEFAULT_STUDENT_EMAIL or metadata['student'] return metadata def _write_entry(entry: dict) -> None: log_dir = Path(os.getenv('AI_LOG_DIR', '.ai-log')) log_dir.mkdir(exist_ok=True) log_file = log_dir / 'session.jsonl' try: with open(log_file, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') except Exception as e: print(f"Error writing to log file: {e}") raise def _read_existing_keys() -> set[tuple[str, str, str]]: log_file = Path(os.getenv('AI_LOG_DIR', '.ai-log')) / 'session.jsonl' keys: set[tuple[str, str, str]] = set() if not log_file.exists(): return keys try: with open(log_file, 'r', encoding='utf-8') as f: for line in f: try: item = json.loads(line) except json.JSONDecodeError: continue keys.add(( str(item.get('tool', '')), str(item.get('session_id', '')), str(item.get('prompt', '')), )) except Exception: pass return keys def _extract_text_from_content(content) -> str: if isinstance(content, str): return content if isinstance(content, list): parts = [] for item in content: if isinstance(item, dict) and isinstance(item.get('text'), str): parts.append(item['text']) return '\n'.join(parts) return '' def _latest_codex_session_file() -> Path | None: root = Path(os.getenv('CODEX_HOME', Path.home() / '.codex')) / 'sessions' if not root.exists(): return None candidates = list(root.rglob('rollout-*.jsonl')) if not candidates: return None return max(candidates, key=lambda path: path.stat().st_mtime) def _extract_codex_prompt_from_session(path: Path) -> dict | None: cwd = str(Path.cwd()).lower() session_id = path.stem current_cwd = '' last_prompt = '' model = '' try: with open(path, 'r', encoding='utf-8') as f: for line in f: try: item = json.loads(line) except json.JSONDecodeError: continue payload = item.get('payload') or {} if item.get('type') == 'turn_context': current_cwd = str(payload.get('cwd', '')).lower() model = str(payload.get('model') or model) continue if current_cwd and current_cwd != cwd: continue if payload.get('type') == 'user_message': message = str(payload.get('message', '')).strip() if message: last_prompt = message continue if payload.get('type') == 'message' and payload.get('role') == 'user': message = _extract_text_from_content(payload.get('content')).strip() if message: last_prompt = message except Exception: return None if not last_prompt: return None return { 'session_id': session_id, 'model': model, 'prompt': last_prompt, } def _log_latest_codex_session() -> bool: session_file = _latest_codex_session_file() if not session_file: return False extracted = _extract_codex_prompt_from_session(session_file) if not extracted: return False prompt = _fix_mojibake_text(extracted['prompt'])[:1000] key = ('codex', extracted['session_id'], prompt) if key in _read_existing_keys(): return True meta = _get_git_metadata() entry = { 'ts': datetime.now(VN_TZ).isoformat(), 'tool': 'codex', 'event': 'codex_session_fallback', 'session_id': extracted['session_id'], 'model': extracted['model'], 'repo': meta['repo'], 'branch': meta['branch'], 'commit': meta['commit'], 'student': meta['student'], 'prompt': prompt, 'response_summary': '', } _write_entry(entry) return True def _make_entry(prompt: str, response: str, tool: str = 'manual') -> dict: meta = _get_git_metadata() return { 'ts': datetime.now(VN_TZ).isoformat(), 'tool': tool, 'event': '', 'session_id': '', 'model': '', 'repo': meta['repo'], 'branch': meta['branch'], 'commit': meta['commit'], 'student': meta['student'], 'prompt': _fix_mojibake_text(prompt)[:1000], 'response_summary': _fix_mojibake_text(response)[:500], } def main() -> None: # 1️⃣ If two positional arguments are supplied, treat them as prompt/response. # Skip this branch if the first arg looks like a flag (e.g. --tool). if len(sys.argv) == 3 and not sys.argv[1].startswith('--'): prompt, response = sys.argv[1], sys.argv[2] entry = _make_entry( prompt, response, os.getenv('AI_TOOL_NAME', 'manual')) _write_entry(entry) print(json.dumps({'status': 'logged'})) return # 2️⃣ Otherwise read from stdin (piped JSON). If nothing comes in, exit silently. parser_tool = None if '--tool' in sys.argv: idx = sys.argv.index('--tool') if idx + 1 < len(sys.argv): parser_tool = sys.argv[idx + 1] raw = _read_stdin_text().strip() if not raw: if parser_tool == 'codex': _log_latest_codex_session() sys.exit(0) raw = _fix_mojibake_text(raw) try: data = json.loads(raw) except json.JSONDecodeError: sys.exit(0) tool = parser_tool or os.getenv('AI_TOOL_NAME', 'manual') meta = _get_git_metadata() # Minimal normalisation – keep only fields we care about. entry = { 'ts': datetime.now(VN_TZ).isoformat(), 'tool': tool, 'event': data.get('hook_event_name') or data.get('event') or '', 'session_id': data.get('session_id') or data.get('conversation_id') or data.get('generation_id') or '', 'model': data.get('model', ''), 'repo': meta['repo'], 'branch': meta['branch'], 'commit': meta['commit'], 'student': meta['student'], 'prompt': _fix_mojibake_text(str(data.get('prompt', '')))[:1000], 'response_summary': _fix_mojibake_text(str(data.get('response', data.get('response_summary', ''))))[:500], } _write_entry(entry) print(json.dumps({'status': 'logged'})) if __name__ == '__main__': main()