File size: 3,907 Bytes
1b5f7e2
dd6f902
1b5f7e2
6ecdf6a
1b5f7e2
 
dd6f902
1b5f7e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd6f902
 
 
 
1b5f7e2
 
 
 
 
dd6f902
 
 
 
 
 
 
1b5f7e2
 
 
 
 
 
 
dd6f902
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b5f7e2
 
 
 
 
 
 
 
 
 
 
 
 
dd6f902
1b5f7e2
 
 
 
 
 
6ecdf6a
 
 
1b5f7e2
 
6ecdf6a
1b5f7e2
 
 
 
 
 
 
 
 
 
 
 
 
 
dd6f902
1b5f7e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd6f902
1b5f7e2
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import io
import json
import os
import re
import uuid
import wave
from pathlib import Path
from urllib.parse import quote

from dotenv import load_dotenv
from fastapi import FastAPI, File, Request, Response, UploadFile
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles

from anthropic import Anthropic
from groq import Groq
from piper import PiperVoice

from hal_prompt import HAL_SYSTEM_PROMPT

# Module-level setup. Everything below runs once at import time, in order.
# load_dotenv() is called first, before the API clients are constructed
# (presumably so that values from a local .env file are visible to them —
# confirm against the python-dotenv / client docs).
load_dotenv()

# Path of the Piper voice model passed to PiperVoice.load() below.
MODEL_PATH = "models/hal.onnx"
# Model identifier passed to the Anthropic client in hal_respond().
CLAUDE_MODEL = "claude-sonnet-4-6"
# Model identifier passed to the Groq transcription call in transcribe().
WHISPER_MODEL = "whisper-large-v3-turbo"
# Upper bound on the number of history *messages* (not user/assistant pairs)
# sent to the model per request; talk() slices history[-MAX_HISTORY_TURNS:].
MAX_HISTORY_TURNS = 20
# Optional file whose contents are appended to the system prompt.
PROFILE_PATH = Path("profile.md")
# Root for persisted data; overridable through the HAL_DATA_DIR environment
# variable, defaulting to the relative directory "data".
DATA_DIR = Path(os.environ.get("HAL_DATA_DIR", "data"))
# One JSON file per session is stored here (see session_file()).
SESSIONS_DIR = DATA_DIR / "sessions"
# Create the directory (and parents) up front; no error if it already exists.
SESSIONS_DIR.mkdir(parents=True, exist_ok=True)

# Load the voice once and keep it in a module global for synthesize_hal().
print("Loading HAL voice...")
VOICE = PiperVoice.load(MODEL_PATH)
print("HAL voice loaded")

# Build the system prompt: the base HAL prompt, plus the profile text when
# profile.md exists. The profile is stripped of surrounding whitespace.
# NOTE(review): read_text() is called without an explicit encoding, so the
# platform default is used — confirm profile.md is compatible with it.
if PROFILE_PATH.exists():
    profile_text = PROFILE_PATH.read_text().strip()
    SYSTEM_PROMPT = f"{HAL_SYSTEM_PROMPT}\n\n---\n\nContext about Peter:\n\n{profile_text}"
    print(f"Loaded profile ({len(profile_text)} chars)")
else:
    SYSTEM_PROMPT = HAL_SYSTEM_PROMPT

# API clients are constructed with no explicit arguments — presumably they
# pick up their credentials from the environment; TODO confirm.
groq_client = Groq()
anthropic_client = Anthropic()

# The web application; files under ./static are served at /static.
app = FastAPI()
app.mount("/static", StaticFiles(directory="static"), name="static")


def session_file(session_id: str) -> Path:
    """Return the path of the JSON history file for *session_id*.

    The session id originates from a client-supplied cookie, so it is
    validated before being used as a file name. Without this check a value
    such as ``"../../x"`` or ``"/tmp/x"`` would make the resulting path point
    outside ``SESSIONS_DIR``.

    Args:
        session_id: Session identifier; 1-64 characters drawn from ASCII
            letters, digits, ``-`` and ``_``. The UUID strings issued by the
            server satisfy this.

    Returns:
        ``SESSIONS_DIR / "<session_id>.json"``.

    Raises:
        ValueError: If *session_id* is empty, too long, or contains any
            other character (path separators, dots, whitespace, ...).
    """
    if not re.fullmatch(r"[A-Za-z0-9_-]{1,64}", session_id):
        raise ValueError(f"invalid session id: {session_id!r}")
    return SESSIONS_DIR / f"{session_id}.json"


def load_history(session_id: str) -> list[dict]:
    """Load the stored conversation history for *session_id*.

    Args:
        session_id: Identifier of the session whose history file is read.

    Returns:
        The list of messages stored in the session file. An empty list is
        returned when the file does not exist, cannot be read, cannot be
        decoded, is not valid JSON, or does not contain a JSON array.
    """
    f = session_file(session_id)
    if not f.exists():
        return []
    try:
        # save_history() writes json.dumps() output, which is pure ASCII by
        # default, so decoding as UTF-8 reads every file it produced.
        data = json.loads(f.read_text(encoding="utf-8"))
    except (ValueError, OSError):
        # ValueError covers json.JSONDecodeError as well as the
        # UnicodeDecodeError raised for a file with undecodable bytes.
        return []
    # Callers append to the result, so anything but a list is treated as a
    # corrupt file, like the other failure cases.
    return data if isinstance(data, list) else []


def save_history(session_id: str, history: list[dict]) -> None:
    """Persist *history* as JSON in the file belonging to *session_id*.

    The JSON is first written to a sibling ``.json.tmp`` file, which is then
    moved over the real session file, so the session file is replaced in one
    step instead of being rewritten in place.
    """
    target = session_file(session_id)
    scratch = target.with_suffix(".json.tmp")
    serialized = json.dumps(history)
    scratch.write_text(serialized)
    scratch.replace(target)


def transcribe(audio_bytes: bytes, filename: str = "audio.webm") -> str:
    """Transcribe recorded speech to text through the Groq client.

    Args:
        audio_bytes: Raw contents of the uploaded audio file.
        filename: Name sent along with the bytes in the upload tuple.

    Returns:
        The transcription text with surrounding whitespace removed.
    """
    upload = (filename, audio_bytes)
    transcription = groq_client.audio.transcriptions.create(
        file=upload,
        model=WHISPER_MODEL,
        language="en",
    )
    text = transcription.text
    return text.strip()


def hal_respond(history: list[dict]) -> str:
    """Ask the Claude model for HAL's next reply.

    Args:
        history: Messages (dicts with ``role`` and ``content``) passed as the
            ``messages`` argument of the request.

    Returns:
        The text of the first content block of the response, stripped of
        surrounding whitespace.
    """
    request_args = {
        "model": CLAUDE_MODEL,
        "max_tokens": 300,
        "system": SYSTEM_PROMPT,
        "messages": history,
    }
    reply = anthropic_client.messages.create(**request_args)
    first_block = reply.content[0]
    return first_block.text.strip()


def synthesize_hal(text: str) -> bytes:
    """Render *text* as speech and return the resulting WAV file as bytes."""
    # Piper reads all-caps acronyms out letter by letter, so every standalone
    # "HAL" is rewritten to "Hal" to have it pronounced as a word.
    acronym = re.compile(r"\bHAL\b")
    speakable = acronym.sub("Hal", text)

    # Synthesize into an in-memory WAV container instead of a file on disk.
    wav_buffer = io.BytesIO()
    with wave.open(wav_buffer, "wb") as wav_out:
        VOICE.synthesize_wav(speakable, wav_out)
    return wav_buffer.getvalue()


@app.get("/")
def index():
    """Serve the front-end page (``static/index.html``) at the site root."""
    page = FileResponse("static/index.html")
    return page


def _is_issued_session_id(value) -> bool:
    """Return True if *value* is a canonical UUID string, as issued by talk()."""
    if not value:
        return False
    try:
        # Round-tripping rejects every non-canonical spelling (braces, URN
        # prefix, upper case, ...) and anything containing path characters.
        return str(uuid.UUID(value)) == value
    except ValueError:
        return False


def _trim_for_model(history: list[dict], limit: int) -> list[dict]:
    """Return at most *limit* trailing messages, starting on a user message."""
    window = history[-limit:]
    start = 0
    # With an even limit and a history that ends on the new user message, the
    # raw slice can open with an assistant reply; drop such leading entries so
    # the conversation sent to the model begins with a user turn.
    while start < len(window) and window[start].get("role") != "user":
        start += 1
    return window[start:]


def _attach_session_cookie(resp: Response, session_id: str) -> None:
    """Set the ``hal_session`` cookie on *resp*."""
    resp.set_cookie("hal_session", session_id, httponly=True, samesite="lax")


@app.post("/api/talk")
async def talk(request: Request, audio: UploadFile = File(...)):
    """Handle one spoken exchange: audio in, HAL's spoken reply out.

    Steps: resolve the session from the ``hal_session`` cookie, transcribe
    the uploaded audio, ask the model for a reply given the recent history,
    persist the extended history, and synthesize the reply as WAV audio.

    Args:
        request: Incoming request; only its cookies are read.
        audio: Uploaded recording of the user's speech.

    Returns:
        A 204 response when the transcription is empty; otherwise an
        ``audio/wav`` response whose ``X-User-Transcript`` and
        ``X-Hal-Transcript`` headers carry the URL-quoted transcripts. The
        session cookie is set whenever a new session was started.

    NOTE(review): transcribe(), hal_respond() and synthesize_hal() are
    synchronous calls made from this async handler — consider whether they
    should be offloaded to a worker thread.
    """
    session_id = request.cookies.get("hal_session")
    # The cookie is client-controlled and ends up in a file name, so anything
    # that is not a UUID we could have issued is discarded and replaced by a
    # fresh session.
    new_session = not _is_issued_session_id(session_id)
    if new_session:
        session_id = str(uuid.uuid4())
    history = load_history(session_id)

    audio_bytes = await audio.read()
    filename = audio.filename or "audio.webm"
    user_text = transcribe(audio_bytes, filename)

    if not user_text:
        # Nothing was recognized: reply with "no content" and leave the
        # stored history untouched.
        resp = Response(status_code=204)
        if new_session:
            _attach_session_cookie(resp, session_id)
        return resp

    history.append({"role": "user", "content": user_text})
    trimmed = _trim_for_model(history, MAX_HISTORY_TURNS)

    hal_text = hal_respond(trimmed)
    history.append({"role": "assistant", "content": hal_text})
    # The full (untrimmed) history is what gets persisted.
    save_history(session_id, history)

    wav_bytes = synthesize_hal(hal_text)

    resp = Response(content=wav_bytes, media_type="audio/wav")
    # Transcripts are percent-encoded so they are safe as header values.
    resp.headers["X-User-Transcript"] = quote(user_text)
    resp.headers["X-Hal-Transcript"] = quote(hal_text)
    if new_session:
        _attach_session_cookie(resp, session_id)
    return resp