File size: 7,527 Bytes
29ebcc1
 
 
 
 
 
 
 
 
 
229a3e3
 
 
 
 
 
29ebcc1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229a3e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29ebcc1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# app/storage.py
# Supabase Storage integration for persisting generated audio files.
# Uploads audio to the tts-audio public bucket and returns a public URL.
# Called as a background thread from run_synthesis — non-blocking.

import os
import threading
from pathlib import Path
from supabase import create_client, Client


# --- cleanup config ---
# Soft cap on the combined size of the audio files in the bucket: cleanup
# starts deleting the oldest files once the total exceeds this many bytes.
_BUCKET_SIZE_LIMIT_BYTES = 800 * 1024**2  # 800MB


# --- client setup ---
# Module-level cache: the Supabase client is built on first use, then reused.
_client: Client | None = None


def _get_client() -> Client:
    """
    Return the shared Supabase client, creating it on the first call.

    The connection settings are read from the SUPABASE_URL and
    SUPABASE_ANON_KEY environment variables.

    Raises:
        ValueError: if either environment variable is unset or empty.
    """
    global _client
    if _client is not None:
        return _client

    supabase_url = os.getenv("SUPABASE_URL")
    anon_key = os.getenv("SUPABASE_ANON_KEY")
    if not (supabase_url and anon_key):
        raise ValueError(
            "SUPABASE_URL and SUPABASE_ANON_KEY must be set in .env"
        )

    _client = create_client(supabase_url, anon_key)
    return _client


def upload_audio(local_path: str, filename: str) -> str | None:
    """
    Upload an audio file to Supabase tts-audio bucket.
    Returns the public URL on success, None on failure.

    Args:
        local_path: full path to local audio file
        filename:   destination filename in bucket (e.g. '2026-04-14_kokoro_K-2.wav')
    """
    try:
        client = _get_client()
        with open(local_path, "rb") as f:
            data = f.read()

        # detect content type
        ext = Path(local_path).suffix.lower()
        content_type = "audio/mpeg" if ext == ".mp3" else "audio/wav"

        client.storage.from_("tts-audio").upload(
            path=filename,
            file=data,
            file_options={"content-type": content_type, "upsert": "true"},
        )

        # build public URL
        result = client.storage.from_("tts-audio").get_public_url(filename)
        return result

    except Exception as e:
        print(f"[Storage] Upload failed for {filename}: {e}")
        return None


def upload_audio_background(local_path: str, filename: str, callback=None) -> None:
    """
    Start the audio upload on a daemon thread and return immediately.

    The thread runs upload_audio(local_path, filename) and, when a truthy
    callback was supplied, passes the result to it (None on failure).

    Args:
        local_path: full path to local audio file
        filename:   destination filename in bucket
        callback:   optional function(url: str | None) called after upload
    """
    def _run():
        public_url = upload_audio(local_path, filename)
        if not callback:
            return
        callback(public_url)

    worker = threading.Thread(target=_run, daemon=True)
    worker.start()

def upload_csv(local_path: str) -> bool:
    """
    Push the local CSV to the Supabase tts-audio bucket as eval_log.csv.

    The upload is requested with the upsert option set so an existing
    eval_log.csv is overwritten. Every exception is caught and logged to
    stdout.

    Args:
        local_path: path of the CSV file to read and upload

    Returns:
        True on success, False on failure.
    """
    try:
        supabase = _get_client()
        with open(local_path, "rb") as csv_file:
            csv_bytes = csv_file.read()

        upload_options = {"content-type": "text/csv", "upsert": "true"}
        bucket = supabase.storage.from_("tts-audio")
        bucket.upload(
            path="eval_log.csv",
            file=csv_bytes,
            file_options=upload_options,
        )
        print("[Storage] eval_log.csv uploaded to Supabase")
        return True

    except Exception as e:
        print(f"[Storage] CSV upload failed: {e}")
        return False


def download_csv(local_path: str) -> bool:
    """
    Download eval_log.csv from the Supabase tts-audio bucket to local_path.

    Missing parent directories of local_path are created. A bare filename
    (no directory component) is written to the current working directory.
    Every exception is caught and logged to stdout.

    Args:
        local_path: destination path for the downloaded CSV

    Returns:
        True on success, False on failure.
    """
    try:
        client = _get_client()
        payload = client.storage.from_("tts-audio").download("eval_log.csv")

        # os.path.dirname() is "" for a bare filename and os.makedirs("")
        # raises FileNotFoundError, so only create the parent when there is one.
        parent_dir = os.path.dirname(local_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(local_path, "wb") as f:
            f.write(payload)

        print("[Storage] eval_log.csv downloaded from Supabase")
        return True

    except Exception as e:
        print(f"[Storage] CSV download failed (will use local fallback): {e}")
        return False


def upload_csv_background(local_path: str) -> None:
    """Run upload_csv(local_path) on a daemon thread and return immediately."""
    worker = threading.Thread(
        target=upload_csv,
        args=(local_path,),
        daemon=True,
    )
    worker.start()


# page size used when listing the bucket (matches the Supabase default limit)
_LIST_PAGE_SIZE = 100


def _list_all_bucket_files(client) -> list[dict]:
    """Return every top-level entry of the tts-audio bucket, one page at a time."""
    bucket = client.storage.from_("tts-audio")
    entries: list[dict] = []
    offset = 0
    while True:
        # a bare list() only returns the first page, which would under-count
        # the bucket size once it holds more files than one page
        page = bucket.list(None, {"limit": _LIST_PAGE_SIZE, "offset": offset})
        if not page:
            break
        entries.extend(page)
        if len(page) < _LIST_PAGE_SIZE:
            break
        offset += len(page)
    return entries


def _entry_size(entry: dict) -> int:
    """Return the size in bytes recorded for a bucket entry, or 0 if unknown."""
    metadata = entry.get("metadata") or {}
    return metadata.get("size", 0) or 0


def _remove_deleted_rows_from_csv(csv_local_path: str, deleted_names: list[str]) -> None:
    """Drop CSV rows that reference deleted files, then re-upload the CSV."""
    try:
        import pandas as pd
        if not os.path.exists(csv_local_path):
            return

        df = pd.read_csv(csv_local_path, dtype={"audio_url": str})

        # URL fragments identifying the deleted objects inside the bucket
        deleted_fragments = {f"tts-audio/{name}" for name in deleted_names}

        # drop rows whose audio_url contains a deleted filename
        original_len = len(df)
        df = df[~df["audio_url"].apply(
            lambda url: any(fragment in str(url) for fragment in deleted_fragments)
        )]

        rows_removed = original_len - len(df)
        df.to_csv(csv_local_path, index=False)
        print(f"[Storage] Cleanup: removed {rows_removed} rows from CSV")

        # re-upload cleaned CSV
        upload_csv(csv_local_path)

    except Exception as e:
        print(f"[Storage] Cleanup: CSV update failed: {e}")


def cleanup_bucket_if_needed(csv_local_path: str) -> None:
    """
    Keep the audio stored in the tts-audio bucket under the size limit.

    Sums the size of all entries in the bucket except eval_log.csv. If the
    total exceeds _BUCKET_SIZE_LIMIT_BYTES, deletes files in ascending
    filename order (the timestamp prefix makes that oldest-first) until the
    total is back under the limit. Rows of the local CSV whose audio_url
    points at a deleted file are removed and the CSV is re-uploaded.

    Entries whose metadata is None (how Supabase reports folders) are
    skipped: they have no size and are not audio objects.

    Never raises: every exception is caught and logged to stdout.

    Args:
        csv_local_path: path of the local eval log CSV to prune
    """
    try:
        client = _get_client()

        # list all files in bucket (paginated)
        files = _list_all_bucket_files(client)
        if not files:
            return

        # filter out the CSV and folder entries - only count audio files.
        # .get("metadata", {}) is None only when the key is present but null.
        audio_files = [
            f for f in files
            if f["name"] != "eval_log.csv" and f.get("metadata", {}) is not None
        ]

        # calculate total size
        total_bytes = sum(_entry_size(f) for f in audio_files)

        if total_bytes <= _BUCKET_SIZE_LIMIT_BYTES:
            return

        print(f"[Storage] Cleanup triggered: {total_bytes / 1024 / 1024:.1f}MB exceeds {_BUCKET_SIZE_LIMIT_BYTES / 1024 / 1024:.0f}MB limit")

        # sort by filename (timestamp prefix ensures chronological order)
        audio_files.sort(key=lambda f: f["name"])

        # delete oldest files until under limit
        freed_bytes = 0
        deleted_names = []

        for f in audio_files:
            if total_bytes - freed_bytes <= _BUCKET_SIZE_LIMIT_BYTES:
                break
            name = f["name"]
            size = _entry_size(f)
            # one request per file so a single failed delete does not stop
            # the rest of the cleanup
            try:
                client.storage.from_("tts-audio").remove([name])
                freed_bytes += size
                deleted_names.append(name)
                print(f"[Storage] Cleanup: deleted {name} ({size / 1024 / 1024:.2f}MB)")
            except Exception as e:
                print(f"[Storage] Cleanup: failed to delete {name}: {e}")

        print(f"[Storage] Cleanup: deleted {len(deleted_names)} files, freed {freed_bytes / 1024 / 1024:.1f}MB")

        if not deleted_names:
            return

        # remove corresponding rows from CSV
        _remove_deleted_rows_from_csv(csv_local_path, deleted_names)

    except Exception as e:
        print(f"[Storage] Cleanup check failed: {e}")


def cleanup_bucket_background(csv_local_path: str) -> None:
    """Run cleanup_bucket_if_needed(csv_local_path) on a daemon thread and return immediately."""
    worker_args = (csv_local_path,)
    worker = threading.Thread(target=cleanup_bucket_if_needed, args=worker_args, daemon=True)
    worker.start()