Spaces:

91prince
/

SEGAN

Paused

App Files Files Community

91prince committed on Dec 11, 2025

Commit

7eadad0

1 Parent(s): 698dbb3

Add SEAGAN model code, pipeline, and large checkpoint file

Browse files

Files changed (6) hide show

README.md +79 -0
SEGAN.py +497 -0
app.py +147 -0
checkpoints/seagan_final.pt +3 -0
pipeline.py +378 -0
requirements.txt +7 -0

README.md ADDED Viewed

	@@ -0,0 +1,79 @@

+SEAGAN Speech Enhancement & API
+===============================
+A minimal speech-denoising project built around a SEGAN-style U-Net generator. It includes:
+- Training script to learn on paired noisy/clean audio.
+- Inference pipeline that denoises long clips in chunks and can pack output audio losslessly into PNG.
+- FastAPI service to expose denoise + PNG pack/restore endpoints.
+Repo Contents
+-------------
+- `SEGAN.py` – training components: config, dataset, U-Net generator, PatchGAN discriminator, training loop.
+- `pipeline.py` – inference utilities: chunked denoiser, spectral gating cleanup, PNG pack/restore helpers.
+- `app.py` – FastAPI app wiring the pipeline for HTTP use.
+- `checkpoints/seagan_final.pt` – example checkpoint (place your own if different).
+- `requirements.txt` – Python dependencies.
+Prerequisites
+-------------
+- Python 3.9+ (tested with PyTorch CPU/GPU builds).
+- For GPU inference/training, install the matching CUDA-enabled `torch`/`torchaudio`.
+- FFmpeg is not required; `torchaudio` handles WAV I/O.
+Install
+-------
+```bash
+python -m venv .venv
+source .venv/bin/activate  # Linux/macOS (Git Bash on Windows: source .venv/Scripts/activate; PowerShell: .\.venv\Scripts\Activate.ps1)
+pip install -r requirements.txt
+```
+If you need a specific CUDA wheel, install torch/torchaudio first, then run `pip install -r requirements.txt` with `--no-deps`.
+Quick Inference (CLI)
+---------------------
+Use the chunked denoiser directly:
+```bash
+python pipeline.py --input path/to/noisy.wav --output path/to/denoised.wav --checkpoint checkpoints/seagan_final.pt
+```
+Notes:
+- `--png-width` controls width when packing to PNG; omit `--no-pack` to also write `*_packed.png` and a reconstructed WAV check.
+- The denoiser mirrors/overlaps chunks to reduce seams and optionally runs a spectral subtraction cleanup.
+FastAPI Service
+---------------
+Environment variables:
+- `CHECKPOINT_PATH` (default `/app/checkpoints/seagan_final.pt`)
+- `CHECKPOINT_URL` (optional download at startup)
+- `SAMPLE_RATE` (default `16000`)
+- `PNG_WIDTH` (default `2048`)
+Run locally:
+```bash
+uvicorn app:app --host 0.0.0.0 --port 8000
+```
+Endpoints:
+- `POST /denoise-and-pack` – form-data key `file` with WAV. Returns packed PNG of denoised audio.
+- `POST /restore-from-png` – form-data key `file` with packed PNG. Returns restored WAV.
+- `GET /health` – health check.
+Model Training
+--------------
+`SEGAN.py` trains on paired noisy/clean WAVs. Update `Config.noisy_dir`, `Config.clean_dir`, and `Config.save_dir` to your paths, then run:
+```bash
+python SEGAN.py
+```
+Checkpoints are written every 5 epochs and as `seagan_final.pt` at the end. The inference pipeline expects a `G_state` entry inside the checkpoint.
+PNG Packing/Restoration Utilities
+---------------------------------
+`pipeline.py` exposes:
+- `save_audio_as_png_lossless(tensor, png_path, width)` – stores int16 PCM in a lossless PNG.
+- `load_audio_from_png_lossless(png_path, original_length)` – restores the tensor.
+- `write_wav_from_tensor(tensor, out_wav_path, sr)` – writes mono WAV.
+Tips
+----
+- Keep input WAVs mono or they will be averaged to mono.
+- Large files are chunked; adjust `chunk_seconds` and `overlap` in `denoise_chunked_final`.
+- Ensure the checkpoint matches the model architecture in `SEGAN.py`.

SEGAN.py ADDED Viewed

	@@ -0,0 +1,497 @@

+#!/usr/bin/env python3
+"""
+SEAGAN-style Speech Enhancement (Noise Removal) Training Script
+- Generator: U-Net on log-magnitude spectrograms
+- Discriminator: PatchGAN-style conditional (noisy + clean/enhanced)
+- Loss: L1 (reconstruction) + adversarial (LSGAN)
+Requirements:
+    pip install torch torchaudio numpy
+"""
+import os
+import glob
+import random
+from typing import List, Tuple
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import Dataset, DataLoader
+import torchaudio
+# ==========================
+#       CONFIG
+# ==========================
+class Config:
+    # Paths (CHANGE THESE TO YOUR FOLDERS)
+    noisy_dir = r"E:\Minor-Project-For-Amity-Patna\Models\Audio Data\Noisy Data"   # noisy wavs
+    clean_dir = r"E:\Minor-Project-For-Amity-Patna\Models\Audio Data\Noiseless Data"   # clean wavs
+    save_dir = r"E:\Minor-Project-For-Amity-Patna\Model SEGAN\checkpoints_seagan"
+    # Audio
+    sample_rate = 16000
+    segment_seconds = 1.0       # train on 1-second chunks
+    mono = True
+    # STFT / Spectrogram
+    n_fft = 512
+    hop_length = 128
+    win_length = 512
+    # Training
+    batch_size = 8
+    num_workers = 2
+    num_epochs = 50
+    lr_g = 2e-4
+    lr_d = 2e-4
+    beta1 = 0.5
+    beta2 = 0.999
+    lambda_l1 = 100.0  # weight for L1 loss vs GAN loss (like pix2pix)
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+cfg = Config()
+# ==========================
+#       DATASET
+# ==========================
+def list_wav_pairs(noisy_dir: str, clean_dir: str) -> List[Tuple[str, str]]:
+    noisy_files = sorted(glob.glob(os.path.join(noisy_dir, "*.wav")))
+    pairs = []
+    for nf in noisy_files:
+        name = os.path.basename(nf)
+        cf = os.path.join(clean_dir, name)
+        if os.path.exists(cf):
+            pairs.append((nf, cf))
+    return pairs
+class SEAGANDataset(Dataset):
+    def __init__(
+        self,
+        noisy_dir: str,
+        clean_dir: str,
+        sample_rate: int = 16000,
+        segment_seconds: float = 1.0,
+    ):
+        self.sample_rate = sample_rate
+        self.segment_samples = int(segment_seconds * sample_rate)
+        self.pairs = list_wav_pairs(noisy_dir, clean_dir)
+        if len(self.pairs) == 0:
+            raise RuntimeError("No paired .wav files found! Check your folders & names.")
+        self.resampler_cache = {}
+    def __len__(self):
+        return len(self.pairs)
+    def _get_resampler(self, orig_sr: int):
+        if orig_sr == self.sample_rate:
+            return None
+        if orig_sr not in self.resampler_cache:
+            self.resampler_cache[orig_sr] = torchaudio.transforms.Resample(
+                orig_freq=orig_sr, new_freq=self.sample_rate
+            )
+        return self.resampler_cache[orig_sr]
+    def _load_audio(self, path: str) -> torch.Tensor:
+        wav, sr = torchaudio.load(path)  # shape: (channels, samples)
+        if wav.shape[0] > 1:
+            wav = wav.mean(dim=0, keepdim=True)  # mono
+        resampler = self._get_resampler(sr)
+        if resampler is not None:
+            wav = resampler(wav)
+        return wav  # (1, samples)
+    def _aligned_random_crop(self, noisy: torch.Tensor, clean: torch.Tensor):
+        """
+        Crop noisy and clean with the same start index for alignment.
+        noisy, clean: (1, T)
+        """
+        T = min(noisy.shape[1], clean.shape[1])
+        noisy = noisy[:, :T]
+        clean = clean[:, :T]
+        if T <= self.segment_samples:
+            pad = self.segment_samples - T
+            noisy = torch.nn.functional.pad(noisy, (0, pad))
+            clean = torch.nn.functional.pad(clean, (0, pad))
+            return noisy, clean
+        else:
+            start = random.randint(0, T - self.segment_samples)
+            end = start + self.segment_samples
+            return noisy[:, start:end], clean[:, start:end]
+    def __getitem__(self, idx: int):
+        noisy_path, clean_path = self.pairs[idx]
+        noisy = self._load_audio(noisy_path)
+        clean = self._load_audio(clean_path)
+        noisy, clean = self._aligned_random_crop(noisy, clean)
+        return noisy, clean
+# ==========================
+#   SPECTROGRAM HELPERS
+# ==========================
+class STFTMagTransform(nn.Module):
+    """
+    Convert waveform -> log-magnitude spectrogram
+    """
+    def __init__(self, n_fft, hop_length, win_length):
+        super().__init__()
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.win_length = win_length
+        # register window so it moves with .to(device)
+        self.register_buffer("window", torch.hann_window(win_length))
+    def forward(self, wav: torch.Tensor) -> torch.Tensor:
+        """
+        wav: (B, 1, T)
+        return: (B, 1, F, T_spec)
+        """
+        B, C, T = wav.shape
+        wav = wav.view(B * C, T)
+        spec = torch.stft(
+            wav,
+            n_fft=self.n_fft,
+            hop_length=self.hop_length,
+            win_length=self.win_length,
+            window=self.window,
+            return_complex=True,
+        )
+        mag = torch.abs(spec)  # (B*C, F, T_spec)
+        log_mag = torch.log1p(mag)  # log(1 + mag)
+        log_mag = log_mag.view(B, C, log_mag.shape[1], log_mag.shape[2])
+        return log_mag
+# ==========================
+#   SIZE MATCH HELPER
+# ==========================
+def match_size(a: torch.Tensor, b: torch.Tensor):
+    """
+    Crop a and b to have the same (H, W). Keeps the top-left region.
+    a, b: (..., H, W)
+    returns: (a_crop, b_crop)
+    """
+    Ha, Wa = a.shape[-2], a.shape[-1]
+    Hb, Wb = b.shape[-2], b.shape[-1]
+    H = min(Ha, Hb)
+    W = min(Wa, Wb)
+    a_c = a[..., :H, :W]
+    b_c = b[..., :H, :W]
+    return a_c, b_c
+# ==========================
+#      GENERATOR (U-NET)
+# ==========================
+class ConvBlock(nn.Module):
+    def __init__(self, in_ch, out_ch, down=True, use_bn=True):
+        super().__init__()
+        if down:
+            layers = [
+                nn.Conv2d(in_ch, out_ch, kernel_size=4, stride=2, padding=1),
+                nn.LeakyReLU(0.2, inplace=True),
+            ]
+        else:
+            layers = [
+                nn.ConvTranspose2d(in_ch, out_ch, kernel_size=4, stride=2, padding=1),
+                nn.ReLU(inplace=True),
+            ]
+        if use_bn:
+            layers.insert(1, nn.BatchNorm2d(out_ch))
+        self.block = nn.Sequential(*layers)
+    def forward(self, x):
+        return self.block(x)
+class UNetGenerator(nn.Module):
+    """
+    U-Net operating on (B, 1, F, T) log-magnitude spectrograms
+    """
+    def __init__(self, in_ch=1, out_ch=1, base_ch=64):
+        super().__init__()
+        # Encoder
+        self.down1 = ConvBlock(in_ch, base_ch, down=True, use_bn=False)  # (64)
+        self.down2 = ConvBlock(base_ch, base_ch * 2)
+        self.down3 = ConvBlock(base_ch * 2, base_ch * 4)
+        self.down4 = ConvBlock(base_ch * 4, base_ch * 8)
+        self.down5 = ConvBlock(base_ch * 8, base_ch * 8)
+        # Bottleneck
+        self.bottleneck = nn.Sequential(
+            nn.Conv2d(base_ch * 8, base_ch * 8, kernel_size=4, stride=2, padding=1),
+            nn.ReLU(inplace=True),
+        )
+        # Decoder
+        self.up1 = ConvBlock(base_ch * 8, base_ch * 8, down=False)
+        self.up2 = ConvBlock(base_ch * 8 * 2, base_ch * 8, down=False)
+        self.up3 = ConvBlock(base_ch * 8 * 2, base_ch * 4, down=False)
+        self.up4 = ConvBlock(base_ch * 4 * 2, base_ch * 2, down=False)
+        self.up5 = ConvBlock(base_ch * 2 * 2, base_ch, down=False)
+        self.final = nn.ConvTranspose2d(
+            base_ch * 2, out_ch, kernel_size=4, stride=2, padding=1
+        )
+        # Output non-negative log-magnitude
+        self.out_act = nn.ReLU()
+    def _crop_to(self, src: torch.Tensor, tgt: torch.Tensor) -> torch.Tensor:
+        """
+        Center-crop src to have the same H, W as tgt.
+        src: (B, C, Hs, Ws)
+        tgt: (B, C, Ht, Wt) (only Ht, Wt are used)
+        """
+        _, _, Hs, Ws = src.shape
+        _, _, Ht, Wt = tgt.shape
+        if Hs == Ht and Ws == Wt:
+            return src
+        start_h = max((Hs - Ht) // 2, 0)
+        start_w = max((Ws - Wt) // 2, 0)
+        end_h = start_h + Ht
+        end_w = start_w + Wt
+        return src[:, :, start_h:end_h, start_w:end_w]
+    def forward(self, x):
+        # encoder
+        d1 = self.down1(x)  # B,64
+        d2 = self.down2(d1) # B,128
+        d3 = self.down3(d2) # B,256
+        d4 = self.down4(d3) # B,512
+        d5 = self.down5(d4) # B,512
+        bott = self.bottleneck(d5)
+        # decoder with crops + skips
+        u1 = self.up1(bott)
+        d5_c = self._crop_to(d5, u1)
+        u1 = torch.cat([u1, d5_c], dim=1)
+        u2 = self.up2(u1)
+        d4_c = self._crop_to(d4, u2)
+        u2 = torch.cat([u2, d4_c], dim=1)
+        u3 = self.up3(u2)
+        d3_c = self._crop_to(d3, u3)
+        u3 = torch.cat([u3, d3_c], dim=1)
+        u4 = self.up4(u3)
+        d2_c = self._crop_to(d2, u4)
+        u4 = torch.cat([u4, d2_c], dim=1)
+        u5 = self.up5(u4)
+        d1_c = self._crop_to(d1, u5)
+        u5 = torch.cat([u5, d1_c], dim=1)
+        out = self.final(u5)
+        out = self.out_act(out)  # non-negative log-magnitude
+        return out
+# ==========================
+#   DISCRIMINATOR (PatchGAN)
+# ==========================
+class PatchDiscriminator(nn.Module):
+    """
+    Conditional discriminator: input = concat(noisy_spec, clean_or_fake_spec)
+    """
+    def __init__(self, in_ch=2, base_ch=64):
+        super().__init__()
+        # no batchnorm in first layer
+        self.model = nn.Sequential(
+            nn.Conv2d(in_ch, base_ch, kernel_size=4, stride=2, padding=1),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv2d(base_ch, base_ch * 2, kernel_size=4, stride=2, padding=1),
+            nn.BatchNorm2d(base_ch * 2),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv2d(base_ch * 2, base_ch * 4, kernel_size=4, stride=2, padding=1),
+            nn.BatchNorm2d(base_ch * 4),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv2d(base_ch * 4, base_ch * 8, kernel_size=4, stride=1, padding=1),
+            nn.BatchNorm2d(base_ch * 8),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv2d(base_ch * 8, 1, kernel_size=4, stride=1, padding=1),
+            # no activation -> LSGAN
+        )
+    def forward(self, x):
+        return self.model(x)  # (B, 1, H', W')
+# ==========================
+#         TRAINING
+# ==========================
+def save_checkpoint(epoch, G, D, opt_g, opt_d, path):
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    torch.save(
+        {
+            "epoch": epoch,
+            "G_state": G.state_dict(),
+            "D_state": D.state_dict(),
+            "opt_g_state": opt_g.state_dict(),
+            "opt_d_state": opt_d.state_dict(),
+        },
+        path,
+    )
+    print(f"Saved checkpoint: {path}")
+def train():
+    device = cfg.device
+    print(f"Using device: {device}")
+    dataset = SEAGANDataset(
+        cfg.noisy_dir, cfg.clean_dir, cfg.sample_rate, cfg.segment_seconds
+    )
+    loader = DataLoader(
+        dataset,
+        batch_size=cfg.batch_size,
+        shuffle=True,
+        num_workers=cfg.num_workers,
+        drop_last=True,
+    )
+    stft_transform = STFTMagTransform(
+        cfg.n_fft, cfg.hop_length, cfg.win_length
+    ).to(device)
+    G = UNetGenerator(in_ch=1, out_ch=1).to(device)
+    D = PatchDiscriminator(in_ch=2).to(device)
+    # LSGAN loss
+    criterion_gan = nn.MSELoss()
+    criterion_l1 = nn.L1Loss()
+    opt_g = optim.Adam(G.parameters(), lr=cfg.lr_g, betas=(cfg.beta1, cfg.beta2))
+    opt_d = optim.Adam(D.parameters(), lr=cfg.lr_d, betas=(cfg.beta1, cfg.beta2))
+    for epoch in range(1, cfg.num_epochs + 1):
+        G.train()
+        D.train()
+        running_g_loss = 0.0
+        running_d_loss = 0.0
+        for i, (noisy_wav, clean_wav) in enumerate(loader):
+            noisy_wav = noisy_wav.to(device)  # (B,1,T)
+            clean_wav = clean_wav.to(device)  # (B,1,T)
+            # -------------------------
+            #   Waveform -> Spectrogram
+            # -------------------------
+            noisy_spec = stft_transform(noisy_wav)  # (B,1,F,T_spec)
+            clean_spec = stft_transform(clean_wav)  # (B,1,F,T_spec)
+            # Ensure same size for real pair
+            noisy_spec, clean_spec = match_size(noisy_spec, clean_spec)
+            # =========================
+            #   Train Discriminator
+            # =========================
+            opt_d.zero_grad()
+            # Real pair: (noisy, clean)
+            real_input = torch.cat([noisy_spec, clean_spec], dim=1)
+            pred_real = D(real_input)
+            target_real = torch.ones_like(pred_real)
+            loss_d_real = criterion_gan(pred_real, target_real)
+            # Fake pair: (noisy, enhanced)
+            with torch.no_grad():
+                fake_spec = G(noisy_spec)
+            # match noisy and fake sizes
+            noisy_for_fake_d, fake_spec_d = match_size(noisy_spec, fake_spec)
+            fake_input = torch.cat([noisy_for_fake_d, fake_spec_d], dim=1)
+            pred_fake = D(fake_input)
+            target_fake = torch.zeros_like(pred_fake)
+            loss_d_fake = criterion_gan(pred_fake, target_fake)
+            loss_d = 0.5 * (loss_d_real + loss_d_fake)
+            loss_d.backward()
+            opt_d.step()
+            # =========================
+            #     Train Generator
+            # =========================
+            opt_g.zero_grad()
+            fake_spec = G(noisy_spec)
+            # GAN loss (want D(noisy, fake) = 1)
+            noisy_for_fake_g, fake_spec_g = match_size(noisy_spec, fake_spec)
+            fake_input_g = torch.cat([noisy_for_fake_g, fake_spec_g], dim=1)
+            pred_fake_for_g = D(fake_input_g)
+            target_real_for_g = torch.ones_like(pred_fake_for_g)
+            loss_g_gan = criterion_gan(pred_fake_for_g, target_real_for_g)
+            # L1 reconstruction loss (match fake & clean sizes)
+            fake_l1, clean_l1 = match_size(fake_spec, clean_spec)
+            loss_g_l1 = criterion_l1(fake_l1, clean_l1) * cfg.lambda_l1
+            loss_g = loss_g_gan + loss_g_l1
+            loss_g.backward()
+            opt_g.step()
+            running_d_loss += loss_d.item()
+            running_g_loss += loss_g.item()
+            if (i + 1) % 20 == 0:
+                print(
+                    f"Epoch [{epoch}/{cfg.num_epochs}] "
+                    f"Step [{i+1}/{len(loader)}] "
+                    f"D Loss: {loss_d.item():.4f}  "
+                    f"G Loss: {loss_g.item():.4f}  "
+                    f"(GAN: {loss_g_gan.item():.4f}, L1: {loss_g_l1.item():.4f})"
+                )
+        avg_d = running_d_loss / len(loader)
+        avg_g = running_g_loss / len(loader)
+        print(
+            f"==> Epoch {epoch} finished | "
+            f"Avg D Loss: {avg_d:.4f} | Avg G Loss: {avg_g:.4f}"
+        )
+        # save checkpoint every few epochs
+        if epoch % 5 == 0:
+            ckpt_path = os.path.join(cfg.save_dir, f"seagan_epoch_{epoch}.pt")
+            save_checkpoint(epoch, G, D, opt_g, opt_d, ckpt_path)
+    # final save
+    ckpt_path = os.path.join(cfg.save_dir, f"seagan_final.pt")
+    save_checkpoint(cfg.num_epochs, G, D, opt_g, opt_d, ckpt_path)
+if __name__ == "__main__":
+    train()

app.py ADDED Viewed

	@@ -0,0 +1,147 @@

+# app.py
+import os
+import io
+import uvicorn
+import torch
+import tempfile
+from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi.responses import FileResponse
+from starlette.middleware.cors import CORSMiddleware
+# --- Import your denoiser functions (adjust import if SEGAN.py is in subfolder) ---
+# from SEGAN import Config, STFTMagTransform, UNetGenerator
+# from your_denoiser_module import denoise_chunked_final, save_audio_as_png_lossless, load_audio_from_png_lossless, write_wav_from_tensor
+# For clarity, this file assumes denoise_chunked_final and packing functions are available in the `pipeline` module.
+from pipeline import InferConfig, denoise_chunked_final, save_audio_as_png_lossless, load_audio_from_png_lossless, write_wav_from_tensor
+# --- Config from env ---
+# Local checkpoint path; ensure_checkpoint() downloads to this path when CHECKPOINT_URL is set.
+CHECKPOINT = os.environ.get("CHECKPOINT_PATH", "/app/checkpoints/seagan_final.pt")
+CHECKPOINT_URL = os.environ.get("CHECKPOINT_URL")  # optional: download at startup
+# Sample rate handed to write_wav_from_tensor when restoring audio from a PNG.
+SAMPLE_RATE = int(os.environ.get("SAMPLE_RATE", "16000"))
+# Width handed to save_audio_as_png_lossless when this app packs the PNG itself.
+PNG_WIDTH = int(os.environ.get("PNG_WIDTH", "2048"))
+# Create directories
+# NOTE: these calls run at import time; the endpoints write their output files under /app/data.
+os.makedirs("/app/data", exist_ok=True)
+os.makedirs("/app/checkpoints", exist_ok=True)
+os.makedirs("/tmp", exist_ok=True)
+app = FastAPI(title="SEGAN Denoise + PNG packer API")
+# NOTE(review): wildcard origins/methods/headers - confirm this is intended outside development.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Download checkpoint if provided via URL and not present
+def ensure_checkpoint():
+    if os.path.isfile(CHECKPOINT):
+        print("Checkpoint exists:", CHECKPOINT)
+        return CHECKPOINT
+    if CHECKPOINT_URL:
+        import requests
+        print("Downloading checkpoint from URL...")
+        r = requests.get(CHECKPOINT_URL, stream=True, timeout=60)
+        if r.status_code != 200:
+            raise RuntimeError("Failed to download checkpoint; status=" + str(r.status_code))
+        outp = CHECKPOINT
+        os.makedirs(os.path.dirname(outp), exist_ok=True)
+        with open(outp, "wb") as f:
+            for chunk in r.iter_content(chunk_size=8192):
+                f.write(chunk)
+        print("Downloaded checkpoint to", outp)
+        return outp
+    raise FileNotFoundError("No checkpoint found; set CHECKPOINT_PATH or CHECKPOINT_URL environment variable.")
+# Build the inference config (InferConfig is imported from pipeline) and point it at CHECKPOINT.
+icfg = InferConfig()  # make sure this respects ckpt path in env inside your class
+icfg.ckpt_path = CHECKPOINT
+@app.on_event("startup")
+def startup_event():
+    # ensure checkpoint present
+    try:
+        cp = ensure_checkpoint()
+    except Exception as e:
+        print("Warning: checkpoint not found at startup:", e)
+    print("Startup complete.")
+@app.post("/denoise-and-pack")
+async def denoise_and_pack(file: UploadFile = File(...)):
+    """
+    Accepts a WAV file upload. Returns a packed PNG containing lossless int16 PCM of denoised audio.
+    Form-data key: 'file'
+    """
+    # Accept only audio/wav or octet-stream
+    if file.content_type not in ("audio/wav", "audio/x-wav", "application/octet-stream"):
+        # still accept many clients — but warn
+        print("Warning: uploaded content_type:", file.content_type)
+    # Save upload to temp WAV file
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_in:
+        tmp_in.write(await file.read())
+        tmp_in.flush()
+        tmp_in_path = tmp_in.name
+    # Prepare output paths
+    base = os.path.splitext(os.path.basename(tmp_in_path))[0]
+    out_wav_path = f"/app/data/{base}_denoised.wav"
+    out_png_path = f"/app/data/{base}_packed.png"
+    # Run denoiser & packer (this function should save WAV and pack PNG; returns paths)
+    try:
+        print("Running denoiser for:", tmp_in_path)
+        # Denoser might be heavy — run on CPU if no GPU
+        out = denoise_chunked_final(tmp_in_path, out_wav_path, icfg,
+                                    chunk_seconds=50.0, overlap=0.5,
+                                    use_spectral_gate=True, noise_frac=0.1, subtract_strength=1.0)
+        # out may be (wav_path, png_path, recon_wav) depending on your pipeline
+    except Exception as e:
+        print("Denoiser error:", e)
+        raise HTTPException(status_code=500, detail="Denoiser failed: " + str(e))
+    # If your denoiser already wrote packed PNG, send that; else pack
+    if os.path.exists(out_png_path):
+        png_to_send = out_png_path
+    else:
+        # load denoised tensor (you may adapt this to how denoiser returns data)
+        # The pipeline.save_audio_as_png_lossless takes a tensor; if you only have file, use torchaudio.load
+        import torchaudio
+        wav, sr = torchaudio.load(out_wav_path)
+        if wav.size(0) > 1:
+            wav = wav.mean(dim=0, keepdim=True)
+        wav1d = wav.squeeze(0)
+        save_audio_as_png_lossless(wav1d, out_png_path, width=PNG_WIDTH)
+        png_to_send = out_png_path
+    return FileResponse(png_to_send, media_type="image/png", filename=os.path.basename(png_to_send))
+@app.post("/restore-from-png")
+async def restore_from_png(file: UploadFile = File(...)):
+    """
+    Accept a packed PNG upload and return restored WAV (mono int16) using SAMPLE_RATE env var.
+    """
+    if file.content_type not in ("image/png", "application/octet-stream"):
+        print("Warning: uploaded content_type:", file.content_type)
+    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_png:
+        tmp_png.write(await file.read())
+        tmp_png.flush()
+        tmp_png_path = tmp_png.name
+    try:
+        restored_tensor = load_audio_from_png_lossless(tmp_png_path, original_length=None)
+        out_wav = f"/app/data/restored_{os.path.basename(tmp_png_path)}.wav"
+        write_wav_from_tensor(restored_tensor, out_wav, SAMPLE_RATE)
+    except Exception as e:
+        print("Restore error:", e)
+        raise HTTPException(status_code=500, detail="Restore failed: " + str(e))
+    return FileResponse(out_wav, media_type="audio/wav", filename=os.path.basename(out_wav))
+# Optional simple healthcheck
+@app.get("/health")
+def health():
+    """Return a constant ``{"status": "ok"}`` payload; the model and checkpoint are not checked."""
+    return {"status": "ok"}
+# Run when invoked directly (development)
+# The port is read from the PORT environment variable and defaults to 8000.
+if __name__ == "__main__":
+    uvicorn.run("app:app", host="0.0.0.0", port=int(os.environ.get("PORT", 8000)))

checkpoints/seagan_final.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9641f1516a0123767e1684f85b09f9fc919949f7104983619bdb5088e815dae8
+size 384194538

pipeline.py ADDED Viewed

	@@ -0,0 +1,378 @@

+#!/usr/bin/env python3
+"""
+pipeline.py
+Contains:
+ - InferConfig (wraps your SEGAN.Config)
+ - denoise_chunked_final(...) -> denoised WAV path, packed PNG path, reconstructed WAV path
+ - save_audio_as_png_lossless / load_audio_from_png_lossless / write_wav_from_tensor
+ - helper utilities used by the denoiser (robust_save, mirror-pad, spectral gating)
+Usage: import the functions in your FastAPI `app.py` or run this file directly for a local test.
+Note: this module expects your SEGAN.py (containing Config, STFTMagTransform, UNetGenerator)
+to be available in the same directory or in PYTHONPATH. Adjust imports if needed.
+"""
+import os
+import math
+import wave
+import torch
+import torch.nn.functional as F
+import torchaudio
+import numpy as np
+from PIL import Image
+# Try to import SEGAN components - user must have SEGAN.py in same folder or package
+try:
+    from SEGAN import Config, STFTMagTransform, UNetGenerator
+except Exception as e:
+    # If import fails, raise a clear error when functions are used; keep module importable for tools that
+    # just want pack/unpack functions.
+    Config = None
+    STFTMagTransform = None
+    UNetGenerator = None
+    # Only defined on this failure path; denoise_chunked_final includes it in its RuntimeError message.
+    _import_error = e
+# ----------------- Configuration (defaults) -----------------
+# Default checkpoint path; can be overridden through the CHECKPOINT_PATH environment variable.
+DEFAULT_CHECKPOINT = os.environ.get("CHECKPOINT_PATH", "./checkpoints/seagan_final.pt")
+# ----------------- Infer config wrapper ---------------------
+class InferConfig:
+    """Simple wrapper for your SEGAN.Config. If SEGAN.Config is available we use it; else provide defaults.
+    Attributes expected by the pipeline: ckpt_path, device, n_fft, hop_length, win_length, sample_rate
+    """
+    def __init__(self,
+                 ckpt_path: str = DEFAULT_CHECKPOINT,
+                 device: str = "cuda" if torch.cuda.is_available() else "cpu",
+                 n_fft: int = 1024,
+                 hop_length: int = 256,
+                 win_length: int = 1024,
+                 sample_rate: int = 16000):
+        # If real SEGAN.Config exists, instantiate it and override ckpt_path + device
+        if Config is not None:
+            try:
+                cfg = Config()
+                cfg.ckpt_path = ckpt_path
+                cfg.device = device
+                # keep other fields from Config if present
+                self.__dict__.update(cfg.__dict__)
+                return
+            except Exception:
+                # fall through to default fields
+                pass
+        # fallback defaults
+        self.ckpt_path = ckpt_path
+        self.device = device
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.win_length = win_length
+        self.sample_rate = sample_rate
+# ---------------- utilities -------------------
+def load_mono_resampled(path: str, target_sr: int):
+    wav, sr = torchaudio.load(path)
+    if wav.size(0) > 1:
+        wav = wav.mean(dim=0, keepdim=True)
+    if sr != target_sr:
+        wav = torchaudio.transforms.Resample(sr, target_sr)(wav)
+        sr = target_sr
+    return wav.squeeze(0)  # (T,)
+def robust_save(path: str, wav_tensor: torch.Tensor, sr: int):
+    x = wav_tensor.detach().cpu()
+    if x.dim() == 1:
+        x = x.unsqueeze(0)
+    while x.dim() > 2 and x.size(0) == 1:
+        x = x.squeeze(0)
+    if x.dim() > 2:
+        x = torch.squeeze(x)
+    if x.dim() == 1:
+        x = x.unsqueeze(0)
+    x = x.float()
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    torchaudio.save(path, x, sr)
+    print(f"Saved WAV: {path} (shape={tuple(x.shape)})")
+def pad_or_crop_freq(mag: torch.Tensor, target_F: int):
+    F_mag = mag.shape[1]
+    if F_mag == target_F:
+        return mag
+    if F_mag < target_F:
+        pad = target_F - F_mag
+        return F.pad(mag, (0, 0, 0, pad))
+    else:
+        return mag[:, :target_F, :]
+def mirror_pad_last_chunk(chunk: torch.Tensor, target_len: int):
+    L = chunk.shape[-1]
+    if L >= target_len:
+        return chunk[:, :, :target_len]
+    need = target_len - L
+    frag = chunk[..., -min(L, need):].flip(-1)
+    out = torch.cat([chunk, frag], dim=-1)
+    if out.shape[-1] < target_len:
+        out = F.pad(out, (0, target_len - out.shape[-1]))
+    return out[:, :, :target_len]
+# ---------------- spectral gating (final cleanup) ----------------
+def spectral_subtract_and_reconstruct(waveform: torch.Tensor, stft_mod, cfg: InferConfig,
+                                      noise_frac=0.1, subtract_strength=1.0, device='cpu'):
+    if waveform.dim() == 1:
+        wav = waveform.unsqueeze(0)  # (1, T)
+    else:
+        wav = waveform
+    wav = wav.to(device)
+    n_fft = cfg.n_fft
+    hop = cfg.hop_length
+    win = stft_mod.window.to(device) if stft_mod is not None else torch.hann_window(cfg.win_length).to(device)
+    spec = torch.stft(wav, n_fft=n_fft, hop_length=hop, win_length=cfg.win_length, window=win, return_complex=True)
+    mag = torch.abs(spec)     # (1, F, T)
+    phase = torch.angle(spec) # (1, F, T)
+    frame_energy = mag.pow(2).sum(dim=1).squeeze(0)  # (T,)
+    n_frames = frame_energy.shape[-1]
+    if n_frames <= 0:
+        return wav.squeeze(0).cpu()
+    k = max(1, int(n_frames * noise_frac))
+    idxs = torch.argsort(frame_energy)[:k]
+    noise_floor = mag[:, :, idxs].median(dim=-1).values  # (1, F)
+    noise_floor_exp = noise_floor.unsqueeze(-1).repeat(1, 1, mag.shape[-1])
+    alpha = subtract_strength
+    mag_sub = mag - alpha * noise_floor_exp
+    mag_sub = torch.clamp(mag_sub, min=0.0)
+    real = mag_sub * torch.cos(phase)
+    imag = mag_sub * torch.sin(phase)
+    complex_sub = torch.complex(real, imag)
+    recon = torch.istft(complex_sub, n_fft=n_fft, hop_length=hop, win_length=cfg.win_length, window=win, length=wav.shape[-1])
+    return recon.squeeze(0).cpu()
+# ---------------- core chunked denoiser (improved) ----------------
+def denoise_chunked_final(input_path: str, output_path: str, cfg: InferConfig,
+                          chunk_seconds=3.0, overlap=0.5,
+                          use_spectral_gate=True, noise_frac=0.1, subtract_strength=1.0,
+                          pack_png=True, png_width=2048):
+    """
+    Runs the chunked denoiser using the SEGAN generator.
+    Returns tuple: (out_wav_path, packed_png_path_or_None, recon_wav_path_or_None)
+    """
+    device = cfg.device
+    print("Device:", device)
+    # Check SEGAN availability
+    if UNetGenerator is None or STFTMagTransform is None or Config is None:
+        raise RuntimeError(f"SEGAN components not available. Original import error: {_import_error}")
+    # load model + stft
+    print("Loading checkpoint:", cfg.ckpt_path)
+    ckpt = torch.load(cfg.ckpt_path, map_location=device)
+    G = UNetGenerator(in_ch=1, out_ch=1).to(device)
+    G.load_state_dict(ckpt["G_state"])
+    G.eval()
+    stft = STFTMagTransform(cfg.n_fft, cfg.hop_length, cfg.win_length).to(device)
+    window = stft.window.to(device)
+    # load audio
+    wav = load_mono_resampled(input_path, cfg.sample_rate)  # (T,)
+    T = wav.shape[0]
+    sr = cfg.sample_rate
+    print(f"Input: {T} samples ({T/sr:.2f} s) SR={sr}")
+    chunk_samples = max(1, int(chunk_seconds * sr))
+    hop = max(1, int(chunk_samples * (1.0 - overlap)))
+    print(f"Chunk {chunk_samples} samples, hop {hop} samples")
+    out_len = T + chunk_samples
+    out_buffer = torch.zeros(out_len, dtype=torch.float32)
+    weight_buffer = torch.zeros(out_len, dtype=torch.float32)
+    synth_win = torch.hann_window(chunk_samples, periodic=True, dtype=torch.float32)
+    idx = 0
+    while idx < T:
+        start = idx
+        end = min(idx + chunk_samples, T)
+        chunk = wav[start:end].unsqueeze(0).unsqueeze(0).to(device)  # (1,1,L)
+        orig_len = chunk.shape[-1]
+        if orig_len < chunk_samples:
+            chunk = mirror_pad_last_chunk(chunk, chunk_samples).to(device)
+        with torch.no_grad():
+            spec = stft(chunk)             # (1,1,F_spec,Frames)
+            fake = G(spec)                 # (1,1,F_fake,Frames)
+            mag = torch.expm1(fake.clamp_min(0.0)).squeeze(1)  # (1,F_fake,Frames)
+        chunk_1d = chunk.view(1, -1)
+        complex_noisy = torch.stft(chunk_1d, n_fft=cfg.n_fft, hop_length=cfg.hop_length,
+                                   win_length=cfg.win_length, window=window, return_complex=True)
+        phase = torch.angle(complex_noisy)  # (1,F_phase,Frames_phase)
+        n_frames_mag = mag.shape[-1]
+        n_frames_phase = phase.shape[-1]
+        min_frames = min(n_frames_mag, n_frames_phase)
+        mag = mag[..., :min_frames]
+        phase = phase[..., :min_frames]
+        expected_F = cfg.n_fft // 2 + 1
+        mag = pad_or_crop_freq(mag, expected_F)
+        real = mag * torch.cos(phase)
+        imag = mag * torch.sin(phase)
+        complex_spec = torch.complex(real, imag).squeeze(0)  # (F, frames)
+        wav_rec = torch.istft(complex_spec.unsqueeze(0).to(device),
+                              n_fft=cfg.n_fft, hop_length=cfg.hop_length,
+                              win_length=cfg.win_length, window=window,
+                              length=chunk_samples).squeeze(0).cpu()
+        if wav_rec.shape[-1] < chunk_samples:
+            wav_rec = F.pad(wav_rec, (0, chunk_samples - wav_rec.shape[-1]))
+        elif wav_rec.shape[-1] > chunk_samples:
+            wav_rec = wav_rec[:chunk_samples]
+        win = synth_win.clone().cpu()
+        wav_rec_win = wav_rec * win
+        write_start = start
+        write_end = start + chunk_samples
+        out_buffer[write_start:write_end] += wav_rec_win
+        weight_buffer[write_start:write_end] += win
+        idx += hop
+    nonzero = weight_buffer > 1e-8
+    out_buffer[nonzero] = out_buffer[nonzero] / weight_buffer[nonzero]
+    denoised = out_buffer[:T].contiguous()
+    if use_spectral_gate:
+        print("Applying final spectral gating...")
+        denoised = spectral_subtract_and_reconstruct(denoised.unsqueeze(0), stft, cfg,
+                                                     noise_frac=noise_frac, subtract_strength=subtract_strength,
+                                                     device=cfg.device)
+    denoised = torch.clamp(denoised, -0.999, 0.999)
+    # save denoised wav
+    robust_save(output_path, denoised, sr)
+    packed_png = None
+    recon_wav = None
+    if pack_png:
+        packed_png = os.path.splitext(output_path)[0] + "_packed.png"
+        save_audio_as_png_lossless(denoised, packed_png, width=png_width)
+        print("Packed denoised audio into PNG:", packed_png)
+        # optional: reconstruct to verify
+        recon_wav = os.path.splitext(output_path)[0] + "_reconstructed_from_png.wav"
+        restored = load_audio_from_png_lossless(packed_png, original_length=denoised.shape[-1])
+        write_wav_from_tensor(restored, recon_wav, sr)
+        print("Reconstructed WAV from PNG:", recon_wav)
+    return output_path, packed_png, recon_wav
+# === Lossless audio <-> PNG packing (bit-perfect) ===
+def audio_tensor_to_int16_array(wav_tensor: torch.Tensor):
+    if isinstance(wav_tensor, torch.Tensor):
+        x = wav_tensor.detach().cpu().numpy()
+    else:
+        x = np.asarray(wav_tensor)
+    if x.ndim == 2 and x.shape[0] == 1:
+        x = x[0]
+    x = np.clip(x, -1.0, 1.0)
+    int16 = (x * 32767.0).astype(np.int16)
+    return int16
+def int16_array_to_audio_tensor(int16_arr: np.ndarray):
+    arr = np.asarray(int16_arr, dtype=np.int16)
+    float32 = (arr.astype(np.float32) / 32767.0)
+    return torch.from_numpy(float32)
+def save_audio_as_png_lossless(wav_tensor: torch.Tensor, png_path: str, width: int = 2048):
+    samples = audio_tensor_to_int16_array(wav_tensor)
+    N = samples.shape[0]
+    height = math.ceil(N / width)
+    total = width * height
+    pad = total - N
+    padded = np.pad(samples, (0, pad), mode='constant', constant_values=0).astype(np.int16)
+    arr = padded.reshape((height, width))
+    uint16_view = arr.view(np.uint16)
+    im = Image.fromarray(uint16_view, mode='I;16')
+    os.makedirs(os.path.dirname(png_path), exist_ok=True)
+    im.save(png_path, format='PNG')
+    print(f"Saved lossless audio PNG: {png_path} (samples={N}, width={width}, height={height})")
+    return png_path
+def load_audio_from_png_lossless(png_path: str, original_length: int = None):
+    im = Image.open(png_path)
+    arr_uint16 = np.array(im, dtype=np.uint16)
+    int16_arr = arr_uint16.view(np.int16).reshape(-1)
+    if original_length is not None:
+        int16_arr = int16_arr[:original_length]
+    float_tensor = int16_array_to_audio_tensor(int16_arr)
+    return float_tensor  # 1D torch tensor
+def write_wav_from_tensor(tensor: torch.Tensor, out_wav_path: str, sr: int):
+    x = tensor.detach().cpu().numpy()
+    int16 = (np.clip(x, -1.0, 1.0) * 32767.0).astype(np.int16)
+    os.makedirs(os.path.dirname(out_wav_path), exist_ok=True)
+    with wave.open(out_wav_path, 'wb') as wf:
+        wf.setnchannels(1)
+        wf.setsampwidth(2)
+        wf.setframerate(sr)
+        wf.writeframes(int16.tobytes())
+    print(f"WAV written (lossless restore): {out_wav_path} (samples={int16.size}, sr={sr})")
+    return out_wav_path
+# ----------------- CLI for quick local test -----------------
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser(description='Denoise WAV and pack into PNG (pipeline module)')
+    parser.add_argument('--input', '-i', required=True, help='Input WAV file path')
+    parser.add_argument('--output', '-o', required=False, help='Output denoised WAV path (default: input_den.wav)')
+    parser.add_argument('--checkpoint', '-c', required=False, help='Checkpoint path')
+    parser.add_argument('--png-width', type=int, default=2048)
+    parser.add_argument('--no-pack', dest='pack', action='store_false')
+    parser.set_defaults(pack=True)
+    args = parser.parse_args()
+    inp = args.input
+    out = args.output or os.path.splitext(inp)[0] + '_denoised.wav'
+    ckpt = args.checkpoint or DEFAULT_CHECKPOINT
+    cfg = InferConfig(ckpt_path=ckpt)
+    print('Running pipeline...')
+    try:
+        out_wav, packed_png, recon = denoise_chunked_final(inp, out, cfg, chunk_seconds=50.0, overlap=0.5,
+                                                           use_spectral_gate=True, noise_frac=0.1, subtract_strength=1.0,
+                                                           pack_png=args.pack, png_width=args.png_width)
+        print('Done.\n', out_wav, packed_png, recon)
+    except Exception as e:
+        print('Pipeline error:', e)
+        raise

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+fastapi
+uvicorn[standard]
+torch==2.1.0+cpu
+torchaudio==2.1.0+cpu
+numpy
+pillow
+requests