import csv
import hashlib
import json
import os

import httpx
from fastapi import FastAPI, Request
from fastapi.middleware.gzip import GZipMiddleware
from fastapi.responses import (
    FileResponse,
    HTMLResponse,
    PlainTextResponse,
    RedirectResponse,
    Response,
    StreamingResponse,
)
from fastapi.staticfiles import StaticFiles
from openai import OpenAI

# Base URL of the OpenAI-compatible completion endpoint.
# NOTE: must end in /v1/ — the OpenAI SDK v1+ appends "completions"
# directly to base_url with no auto /v1/ prefix. The HF dedicated
# endpoint serves the OpenAI-compatible API at /v1/completions, so
# without the suffix the SDK hits /completions and the endpoint
# returns 404. Upstream commit 2831701 dropped the /v1/ but HF Spaces
# masks this via an ENDPOINT_URL secret that includes it; running
# locally with the default URL needs the suffix put back.
ENDPOINT_URL = os.environ.get(
    "ENDPOINT_URL",
    "https://cr2l9w72ys5pp8le.us-east-1.aws.endpoints.huggingface.cloud/v1/",
)

# Model identifier; overridable through the MODEL_NAME environment variable.
MODEL_NAME = os.environ.get("MODEL_NAME", "HuggingFaceBio/Carbon-3B")

# NVIDIA NIM ESMFold endpoint (alignment-free protein structure prediction).
# Schema: POST {"sequence": ""} → {"pdbs": [""]}.
# Constraints: max 1024 aa, charset = 20 standard AAs only.
NIM_FOLD_URL = os.environ.get(
    "NIM_FOLD_URL",
    "https://health.api.nvidia.com/v1/biology/nvidia/esmfold",
)
FOLD_MAX_LEN = 1024
FOLD_AA_ALPHABET = "ARNDCQEGHILKMFPSTWYV"

# In-memory cache: sha1(sequence) → result dict. ESMFold is deterministic at
# temperature 0, so caching is safe and lets demo viewers replay the same
# protein for free. Bounded to keep memory predictable on long-running Spaces.
_FOLD_CACHE: dict[str, dict] = {}
_FOLD_CACHE_MAX = 256

# Directory containing this module; sibling assets are resolved against it.
HERE = os.path.dirname(os.path.abspath(__file__))

# Absolute base URL used to fill {{SITE_URL}} placeholders in demo.html,
# sitemap.xml and robots.txt (og:image, canonical, sitemap reference…).
# If unset, we derive it per-request from the X-Forwarded-* headers (HF
# Spaces sits behind a proxy that sets them) so og:image, canonical and
# the sitemap stay correct on whatever host the page is served from.
SITE_URL_ENV = os.environ.get("SITE_URL", "").rstrip("/")


def _first_forwarded_value(raw) -> str:
    """Return the first entry of a comma-separated X-Forwarded-* header.

    ``raw`` is the header value or ``None``. The result is stripped of
    surrounding whitespace and is ``""`` when the header is missing or its
    first entry is blank, so callers can chain fallbacks with ``or``.
    """
    if not raw:
        return ""
    return raw.split(",")[0].strip()


def site_url_for(request: Request) -> str:
    """Return the absolute origin (scheme://host, no trailing slash).

    The SITE_URL environment override wins when set. Otherwise the origin is
    rebuilt from the request: X-Forwarded-Proto / X-Forwarded-Host first,
    then the request's own scheme and Host header.
    """
    if SITE_URL_ENV:
        return SITE_URL_ENV

    headers = request.headers
    # X-Forwarded-* headers may carry a comma-separated chain when multiple
    # proxies are involved; the original client-visible value is the first
    # entry. This applies to the proto header as much as to the host header:
    # using "https, http" verbatim would produce a malformed origin.
    # NOTE(review): these headers are only as trustworthy as the fronting
    # proxy; set SITE_URL to pin the origin if that is a concern.
    scheme = (
        _first_forwarded_value(headers.get("x-forwarded-proto"))
        or request.url.scheme
        or "http"
    )
    # An empty first entry falls through to the Host header instead of
    # yielding an origin with no host.
    host = (
        _first_forwarded_value(headers.get("x-forwarded-host"))
        or headers.get("host")
        or request.url.netloc
    )
    return f"{scheme}://{host}"


def _load_text(path: str) -> str:
    """Read the file at ``path`` as UTF-8 and return its full contents."""
    with open(path, encoding="utf-8") as f:
        return f.read()


# Templates loaded once at startup. demo.html and social-banner.html are
# large; reading them on every request would add ~100 us of syscall +
# parse overhead each time, which adds up under load. The substitution
# itself (a single str.replace) is cheap.
#
# DEV=1 disables the cache and re-reads from disk on every request so
# edits to demo.html / social-banner.html / robots / sitemap / llms show
# up on the next reload without restarting the server.
DEV = bool(os.environ.get("DEV"))

# Logical template name -> absolute path of the file next to this module.
_TEMPLATE_PATHS = {
    key: os.path.join(HERE, filename)
    for key, filename in {
        "demo": "demo.html",
        "social_banner": "social-banner.html",
        "robots": "robots.txt",
        "sitemap": "sitemap.xml",
        "llms": "llms.txt",
    }.items()
}

# Contents of every template, read eagerly at import time.
_TEMPLATE_CACHE = {key: _load_text(_TEMPLATE_PATHS[key]) for key in _TEMPLATE_PATHS}


def template(name: str) -> str:
    """Return the text of template ``name``.

    Served from the startup cache, unless DEV is set, in which case the file
    is re-read from disk on every call. Raises KeyError for an unknown name.
    """
    return _load_text(_TEMPLATE_PATHS[name]) if DEV else _TEMPLATE_CACHE[name]


def render(template: str, site_url: str) -> str:
    """Fill every ``{{SITE_URL}}`` placeholder in ``template`` with ``site_url``."""
    rendered = template.replace("{{SITE_URL}}", site_url)
    return rendered


def get_api_key():
    """Return an API token, or None when none can be found.

    The HF_TOKEN environment variable is preferred; otherwise the token is
    requested from ``huggingface_hub.get_token``. Any failure on that path
    (including the package being unavailable) yields None.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        try:
            from huggingface_hub import get_token

            token = get_token()
        except Exception:
            # Best-effort lookup: a missing package or failing lookup simply
            # means "no token".
            token = None
    return token


def left_pad_to_six(seq: str) -> tuple[str, int]:
    """Prepend 'A's so the DNA length is a multiple of 6 (Carbon's BPE token width).

    Without padding, the endpoint right-pads with 'A's, which means the
    model's next-token prediction is conditioned on phantom 'A's *at the
    end* of the immediate context — exactly the part that influences the
    next prediction most. Left-padding instead pushes the phantom bases
    into the older context so the user's actual prompt is what the model
    sees right before the prediction boundary.

    Returns (padded_sequence, n_phantom_bases_prepended). An empty sequence
    or one already aligned to 6 is returned unchanged with a count of 0.
    """
    # -len % 6 is the distance up to the next multiple of 6 (0 when aligned).
    missing = -len(seq) % 6 if seq else 0
    if missing == 0:
        return seq, 0
    return "A" * missing + seq, missing


app = FastAPI()

# Compress responses >= 1 KB. Mostly aimed at /umap (~4 MB binary blob
# → ~2 MB on the wire) and the JSON gene/variant/species catalogs.
# compresslevel=6 is the gzip(1) system default — within ~3% of level 9
# in ratio but ~5x cheaper in CPU. Worth it on every request.
app.add_middleware(GZipMiddleware, minimum_size=1024, compresslevel=6)

# Static images are served from the img/ directory next to this module.
app.mount("/img", StaticFiles(directory=os.path.join(HERE, "img")), name="img")

# Modular CSS / JS for demo.html. demo.html used to be a 6 kLOC monolith
# with a single inline