Update app.py
app.py
CHANGED
@@ -1,15 +1,30 @@
+"""
+Mile Zero Tours Image API + PDF Upload API (FastAPI)
+
+Adds:
+- /pdfupload/{tour} (admin-only) upload a PDF file (multipart)
+- /pdfget/{tour}.pdf (public) fetch PDF (cached -> HF dataset -> 404 if none)
+- /pdfdelete/{tour} (admin-only) delete cached PDF (and schedule HF commit)
+
+Storage:
+- JSON stays in: dataset_cache/{tour}.json -> committed to HF under data/
+- PDFs go in:    dataset_cache/pdfs/{tour}.pdf -> committed to HF under pdfs/
+
+Notes:
+- Keeps your existing image endpoints untouched (except minor imports cleanup).
+- Uses your existing CommitScheduler to commit both JSON + PDFs (same scheduler).
+- Enforces .pdf extension + content-type check + max size (configurable).
+"""
+
 from fastapi import FastAPI, UploadFile, File, Form, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse
+from fastapi.responses import JSONResponse, Response
 import os, json, base64, asyncio
 import httpx
 from pathlib import Path
-from huggingface_hub import CommitScheduler
-from
-
-
-
-from urllib.parse import quote, unquote
+from huggingface_hub import CommitScheduler, hf_hub_download
+from urllib.parse import unquote
+import re
 
 # ==================================================
 # APP SETUP
@@ -52,8 +67,7 @@ ALL_TOURS = [
     "Epic + Atlantic 1",
     "Epic + Maritimes 1",
     "Atlantic 1"
-]
-
+]
 
 ADMIN_TOKEN = os.environ.get("ADMIN_TOKEN")
 GOOGLE_KEY = os.environ.get("GOOGLE_KEY")
@@ -62,49 +76,24 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 SHEET_ID = "1o0AUq13j-7LZWDhCwFYgq07niZtvOya5iE5bbRQMGWc"
 
 DATASET_REPO = "SalexAI/mztimgs" # 👈 change if needed
+
+# Local cache folder that CommitScheduler watches
 DATASET_DIR = Path("dataset_cache")
 DATASET_DIR.mkdir(parents=True, exist_ok=True)
 
+# PDF cache folder (also inside DATASET_DIR so scheduler can commit it)
+PDF_DIR = DATASET_DIR / "pdfs"
+PDF_DIR.mkdir(parents=True, exist_ok=True)
+
+# PDF constraints
+MAX_PDF_BYTES = int(os.environ.get("MAX_PDF_BYTES", str(25 * 1024 * 1024))) # default 25MB
+
 if not ADMIN_TOKEN:
     print("⚠️ WARNING: ADMIN_TOKEN not set")
-
 if not GOOGLE_KEY:
     print("⚠️ WARNING: GOOGLE_KEY not set")
-
-
-
-
-def normalize_tour(tour: str) -> str:
-    return unquote(tour).strip()
-
-
-async def fetch_from_hf(tour: str) -> dict | None:
-    """
-    Correct HF fetch:
-    - filenames are ALWAYS decoded
-    - HF handles URL encoding internally
-    """
-
-    filename = f"{tour}.json"
-    print("🔍 HF HUB DOWNLOAD TRY:", filename)
-
-    try:
-        path = await asyncio.to_thread(
-            hf_hub_download,
-            repo_id=DATASET_REPO,
-            repo_type="dataset",
-            filename=f"data/{filename}", # ← decoded, with spaces
-            token=HF_TOKEN,
-        )
-
-        print("⬇️ HF HUB DOWNLOADED:", path)
-
-        with open(path, "r", encoding="utf-8") as f:
-            return json.load(f)
-
-    except Exception as e:
-        print("❌ HF HUB DOWNLOAD FAILED:", str(e))
-        return None
+if not HF_TOKEN:
+    print("⚠️ WARNING: HF_TOKEN not set (HF downloads/commits may fail)")
 
 # ==================================================
 # HF DATASET COMMIT SCHEDULER
@@ -113,6 +102,9 @@ scheduler = CommitScheduler(
     repo_id=DATASET_REPO,
     repo_type="dataset",
     folder_path=DATASET_DIR,
+    # Everything inside dataset_cache will be committed under this folder in the repo.
+    # So: dataset_cache/Foo.json -> data/Foo.json
+    #     dataset_cache/pdfs/Foo.pdf -> data/pdfs/Foo.pdf
     path_in_repo="data",
     token=HF_TOKEN,
 )
@@ -120,53 +112,39 @@ scheduler = CommitScheduler(
 # ==================================================
 # HELPERS
 # ==================================================
-
+def normalize_tour(tour: str) -> str:
+    return unquote(tour).strip()
+
+def require_admin(token: str):
+    if not ADMIN_TOKEN or token != ADMIN_TOKEN:
+        raise HTTPException(status_code=403, detail="Invalid admin token")
 
 def has_images(data: dict) -> bool:
     imgs = data.get("images", {})
-    return bool(
-        imgs.get("banner") or
-        imgs.get("cover") or
-        imgs.get("carousel")
-    )
-
+    return bool(imgs.get("banner") or imgs.get("cover") or imgs.get("carousel"))
 
 def get_fallback_tours(requested: str) -> list[str]:
-    """
-    If 'Maritimes' is requested → returns ['Maritimes 1', 'Maritimes 2']
-    If 'Haida Gwaii' → ['Haida Gwaii 1' ...]
-    Otherwise empty list
-    """
     base = requested.strip()
-
-    # If request already ends in a number, don't fallback
     if re.search(r"\s\d+$", base):
         return []
 
-    matches = []
-    for t in ALL_TOURS:
-        if t.startswith(base + " "):
-            matches.append(t)
+    matches = [t for t in ALL_TOURS if t.startswith(base + " ")]
 
-
-    def tour_num(name):
+    def tour_num(name: str) -> int:
        m = re.search(r"(\d+)$", name)
        return int(m.group(1)) if m else 0
 
     return sorted(matches, key=tour_num)
 
 def empty_structure():
-    return {
-        "images": {
-            "banner": "",
-            "cover": "",
-            "carousel": []
-        }
-    }
+    return {"images": {"banner": "", "cover": "", "carousel": []}}
 
 def tour_path(tour: str) -> Path:
     return DATASET_DIR / f"{tour}.json"
 
+def pdf_path(tour: str) -> Path:
+    return PDF_DIR / f"{tour}.pdf"
+
 def load_json(path: Path) -> dict:
     if not path.exists():
         return empty_structure()
@@ -177,57 +155,62 @@ def save_json(path: Path, data: dict):
     with path.open("w", encoding="utf-8") as f:
         json.dump(data, f, indent=2)
 
-def
-
-
-
-# ==================================================
-# GET IMAGE JSON
-# ==================================================
-@app.get("/imageget/{tour}.json")
-async def get_images(tour: str):
-    tour = normalize_tour(tour)
+async def fetch_from_hf_json(tour: str) -> dict | None:
+    filename = f"{tour}.json"
+    print("🔍 HF HUB DOWNLOAD TRY (json):", filename)
 
-
-
-
-
-
-
+    try:
+        path = await asyncio.to_thread(
+            hf_hub_download,
+            repo_id=DATASET_REPO,
+            repo_type="dataset",
+            filename=f"data/{filename}",
+            token=HF_TOKEN,
+        )
+        print("⬇️ HF HUB DOWNLOADED (json):", path)
 
-
-
-
-
+        with open(path, "r", encoding="utf-8") as f:
+            return json.load(f)
+    except Exception as e:
+        print("❌ HF HUB DOWNLOAD FAILED (json):", str(e))
+        return None
 
-
-
-
+async def fetch_from_hf_pdf_bytes(tour: str) -> bytes | None:
+    filename = f"{tour}.pdf"
+    print("🔍 HF HUB DOWNLOAD TRY (pdf):", filename)
 
-
-
-
-
+    try:
+        path = await asyncio.to_thread(
+            hf_hub_download,
+            repo_id=DATASET_REPO,
+            repo_type="dataset",
+            filename=f"data/pdfs/{filename}",
+            token=HF_TOKEN,
+        )
+        print("⬇️ HF HUB DOWNLOADED (pdf):", path)
 
-
-
-
-
+        with open(path, "rb") as f:
+            return f.read()
+    except Exception as e:
+        print("❌ HF HUB DOWNLOAD FAILED (pdf):", str(e))
+        return None
 
-
-
+def sniff_pdf(raw: bytes) -> bool:
+    # PDFs start with: %PDF-
+    return raw[:5] == b"%PDF-"
 
+# ==================================================
+# ROUTES
+# ==================================================
 @app.get("/")
 async def root_status():
     tours = []
-
     for path in DATASET_DIR.glob("*.json"):
         try:
             with path.open("r", encoding="utf-8") as f:
                 data = json.load(f)
 
             images = data.get("images", {})
-
             banner = bool(images.get("banner"))
             cover = bool(images.get("cover"))
             carousel_count = len(images.get("carousel", []))
@@ -237,22 +220,55 @@ async def root_status():
                 "banner": banner,
                 "cover": cover,
                 "carousel": carousel_count,
-                "total_images": int(banner) + int(cover) + carousel_count
+                "total_images": int(banner) + int(cover) + carousel_count,
+                "has_pdf": pdf_path(path.stem).exists(),
             })
-
         except Exception as e:
-            tours.append({
-                "tour": path.stem,
-                "error": str(e)
-            })
+            tours.append({"tour": path.stem, "error": str(e)})
 
     return {
         "status": "ok",
-        "service": "Mile Zero Tours Image API",
+        "service": "Mile Zero Tours Image + PDF API",
         "cached_tours": len(tours),
-        "tours": sorted(tours, key=lambda t: t.get("tour", ""))
+        "tours": sorted(tours, key=lambda t: t.get("tour", "")),
     }
 
+# ==================================================
+# GET IMAGE JSON
+# ==================================================
+@app.get("/imageget/{tour}.json")
+async def get_images(tour: str):
+    tour = normalize_tour(tour)
+
+    # 1) exact cache
+    path = tour_path(tour)
+    if path.exists():
+        data = load_json(path)
+        if has_images(data):
+            return data
+
+    # 2) exact HF
+    data = await fetch_from_hf_json(tour)
+    if data and has_images(data):
+        save_json(path, data)
+        return data
+
+    # 3) fallback numbered tours
+    for alt in get_fallback_tours(tour):
+        alt_path = tour_path(alt)
+
+        if alt_path.exists():
+            alt_data = load_json(alt_path)
+            if has_images(alt_data):
+                return alt_data
+
+        alt_data = await fetch_from_hf_json(alt)
+        if alt_data and has_images(alt_data):
+            save_json(alt_path, alt_data)
+            return alt_data
+
+    return empty_structure()
+
 # ==================================================
 # UPLOAD IMAGE
 # ==================================================
@@ -265,6 +281,7 @@ async def upload_image(
     base64_data: str = Form(None),
 ):
     require_admin(admin_token)
+    tour = normalize_tour(tour)
 
     if slot not in ("banner", "cover", "carousel"):
         raise HTTPException(status_code=400, detail="Invalid slot")
@@ -290,12 +307,7 @@ async def upload_image(
 
     save_json(path, data)
 
-    return {
-        "ok": True,
-        "tour": tour,
-        "slot": slot,
-        "carousel_len": len(data["images"]["carousel"]),
-    }
+    return {"ok": True, "tour": tour, "slot": slot, "carousel_len": len(data["images"]["carousel"])}
 
 # ==================================================
 # DELETE IMAGE
@@ -308,6 +320,7 @@ async def delete_image(
     index: int = Form(None),
 ):
     require_admin(admin_token)
+    tour = normalize_tour(tour)
 
     path = tour_path(tour)
 
@@ -327,6 +340,129 @@ async def delete_image(
 
     return {"ok": True}
 
+# ==================================================
+# PDF UPLOAD (NEW)
+# ==================================================
+@app.post("/pdfupload/{tour}")
+async def upload_pdf(
+    tour: str,
+    admin_token: str = Form(...),
+    file: UploadFile = File(...),
+):
+    require_admin(admin_token)
+    tour = normalize_tour(tour)
+
+    # Basic content-type / filename checks (clients are often inconsistent, so we verify bytes too)
+    filename = (file.filename or "").lower()
+    if not (filename.endswith(".pdf") or file.content_type == "application/pdf"):
+        raise HTTPException(status_code=400, detail="Only PDF files are allowed")
+
+    raw = await file.read()
+
+    if len(raw) == 0:
+        raise HTTPException(status_code=400, detail="Empty file")
+    if len(raw) > MAX_PDF_BYTES:
+        raise HTTPException(status_code=413, detail=f"PDF too large (max {MAX_PDF_BYTES} bytes)")
+    if not sniff_pdf(raw):
+        raise HTTPException(status_code=400, detail="File does not look like a valid PDF")
+
+    path = pdf_path(tour)
+
+    with scheduler.lock:
+        path.write_bytes(raw)
+
+    return {
+        "ok": True,
+        "tour": tour,
+        "bytes": len(raw),
+        "pdf": f"/pdfget/{tour}.pdf",
+    }
+
+# ==================================================
+# PDF GET (NEW)
+# ==================================================
+@app.get("/pdfget/{tour}.pdf")
+async def get_pdf(tour: str):
+    tour = normalize_tour(tour)
+
+    # 1) local cache
+    path = pdf_path(tour)
+    if path.exists():
+        raw = path.read_bytes()
+        return Response(
+            content=raw,
+            media_type="application/pdf",
+            headers={
+                "Cache-Control": "public, max-age=300",
+                "Content-Disposition": f'inline; filename="{tour}.pdf"',
+            },
+        )
+
+    # 2) HF fetch -> cache -> return
+    raw = await fetch_from_hf_pdf_bytes(tour)
+    if raw:
+        # safety: avoid caching huge or non-pdf blobs
+        if len(raw) <= MAX_PDF_BYTES and sniff_pdf(raw):
+            with scheduler.lock:
+                path.write_bytes(raw)
+        return Response(
+            content=raw,
+            media_type="application/pdf",
+            headers={
+                "Cache-Control": "public, max-age=300",
+                "Content-Disposition": f'inline; filename="{tour}.pdf"',
+            },
+        )
+
+    # 3) fallback numbered tours (optional)
+    for alt in get_fallback_tours(tour):
+        alt_path = pdf_path(alt)
+        if alt_path.exists():
+            raw2 = alt_path.read_bytes()
+            return Response(
+                content=raw2,
+                media_type="application/pdf",
+                headers={
+                    "Cache-Control": "public, max-age=300",
+                    "Content-Disposition": f'inline; filename="{alt}.pdf"',
+                },
+            )
+
+        raw2 = await fetch_from_hf_pdf_bytes(alt)
+        if raw2:
+            if len(raw2) <= MAX_PDF_BYTES and sniff_pdf(raw2):
+                with scheduler.lock:
+                    alt_path.write_bytes(raw2)
+            return Response(
+                content=raw2,
+                media_type="application/pdf",
+                headers={
+                    "Cache-Control": "public, max-age=300",
+                    "Content-Disposition": f'inline; filename="{alt}.pdf"',
+                },
+            )
+
+    raise HTTPException(status_code=404, detail="PDF not found")
+
+# ==================================================
+# PDF DELETE (NEW)
+# ==================================================
+@app.post("/pdfdelete/{tour}")
+async def delete_pdf(
+    tour: str,
+    admin_token: str = Form(...),
+):
+    require_admin(admin_token)
+    tour = normalize_tour(tour)
+
+    path = pdf_path(tour)
+
+    with scheduler.lock:
+        if path.exists():
+            path.unlink()
+
+    return {"ok": True, "tour": tour, "deleted": True}
+
 # ==================================================
 # GOOGLE SHEETS PROXY (NO KEY LEAK)
 # ==================================================
@@ -347,7 +483,7 @@ async def proxy_google_sheets(range: str):
     if r.status_code != 200:
         return JSONResponse(
             status_code=r.status_code,
-            content={"error": "Google Sheets fetch failed"}
+            content={"error": "Google Sheets fetch failed"},
         )
 
-    return r.json()
+    return r.json()
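
For reference, a minimal client-side sketch of how the PDF endpoints added in this commit could be called. It is an illustration only: BASE_URL, the token value, the tour name, and the local file name are placeholders (the token must match the server's ADMIN_TOKEN environment variable), and httpx is simply the same HTTP client the app already depends on.

import httpx

BASE_URL = "http://localhost:7860"  # placeholder: wherever this FastAPI app is served
ADMIN_TOKEN = "change-me"           # placeholder: must equal the server's ADMIN_TOKEN env var
TOUR = "Atlantic 1"                 # placeholder: any tour name from ALL_TOURS

# Upload a PDF for a tour (admin-only, multipart form: admin_token + file)
with open("brochure.pdf", "rb") as f:
    r = httpx.post(
        f"{BASE_URL}/pdfupload/{TOUR}",
        data={"admin_token": ADMIN_TOKEN},
        files={"file": ("brochure.pdf", f, "application/pdf")},
    )
r.raise_for_status()
print(r.json())  # e.g. {"ok": True, "tour": "Atlantic 1", "bytes": ..., "pdf": "/pdfget/Atlantic 1.pdf"}

# Fetch it back (public); served from the local cache or pulled from the HF dataset
pdf = httpx.get(f"{BASE_URL}/pdfget/{TOUR}.pdf")
print(pdf.status_code, pdf.headers.get("content-type"), len(pdf.content))

# Delete the cached PDF (admin-only)
httpx.post(f"{BASE_URL}/pdfdelete/{TOUR}", data={"admin_token": ADMIN_TOKEN})

On the server side the uploaded file lands in dataset_cache/pdfs/{tour}.pdf, and the existing CommitScheduler later commits it to the dataset repo under data/pdfs/.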