vero_back_test2

Sleeping

App Files Files Community

omgy committed on Oct 30, 2025

Commit

ec56450

verified ·

1 Parent(s): 07738ea

Upload 3 files

Browse files

Files changed (3) hide show

Dockerfile +43 -0
app.py +454 -0
requirements.txt +18 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,43 @@

+# syntax=docker/dockerfile:1.6
+# Base Python image (Debian release pinned so the tessdata path below stays valid)
+FROM python:3.11-slim-bookworm
+# Environment for reliable, quiet, and unbuffered Python
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PORT=7860
+# Install system packages required for OCR and PDF rasterization
+# - tesseract-ocr and language data (eng)
+# - poppler-utils provides `pdftoppm` used by pdf2image
+# - libgl1 needed by some Pillow operations in headless containers
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        tesseract-ocr \
+        tesseract-ocr-eng \
+        poppler-utils \
+        libgl1 \
+    && rm -rf /var/lib/apt/lists/*
+# Set tessdata path. Debian bookworm ships Tesseract 5, whose language data
+# lives under .../5/tessdata (the older 4.00 directory does not exist there).
+ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
+# App directory
+WORKDIR /app
+# Install Python dependencies first for better layer caching
+COPY requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r /app/requirements.txt
+# Copy application code
+COPY . /app
+# Expose default HF Spaces port
+EXPOSE 7860
+# Start the FastAPI server
+# Note: honour the PORT env var when it is set, defaulting to 7860.
+# A shell is needed for the variable expansion; `exec` keeps uvicorn as PID 1
+# so it receives container stop signals directly.
+CMD ["sh", "-c", "exec uvicorn app:app --host 0.0.0.0 --port ${PORT:-7860}"]

app.py ADDED Viewed

	@@ -0,0 +1,454 @@

+import io
+import os
+import uuid
+import json
+import shutil
+import logging
+import mimetypes
+import tempfile
+from typing import Optional, Tuple
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse, StreamingResponse, PlainTextResponse
+# ---------------------------------------------------------------------------------------
+# App setup
+# ---------------------------------------------------------------------------------------
+app = FastAPI(
+    title="Document Enhancer Backend",
+    description=(
+        "A FastAPI backend suitable for Hugging Face Spaces. "
+        "It accepts a document and a prompt, extracts text/layout, calls Gemini for edits, "
+        "and rebuilds a document with the requested tweaks. "
+        "Note: Full layout preservation with OCR is complex; this entrypoint provides a working scaffold."
+    ),
+    version="0.1.0",
+)
+# Allow CORS for web UIs hosted elsewhere
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # Restrict in production
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+logger = logging.getLogger("uvicorn.error")
+# ---------------------------------------------------------------------------------------
+# Utilities
+# ---------------------------------------------------------------------------------------
+def _in_spaces() -> bool:
+    # Heuristic env flag when running on Hugging Face Space
+    return bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE") or os.getenv("SYSTEM"))
+def _env_info() -> dict:
+    return {
+        "running_in_spaces": _in_spaces(),
+        "python": f"{os.sys.version_info.major}.{os.sys.version_info.minor}.{os.sys.version_info.micro}",
+        "gemini_api_key_set": bool(os.getenv("GEMINI_API_KEY")),
+        "tesseract_cmd": os.getenv("TESSERACT_CMD"),
+        "tessdata_prefix": os.getenv("TESSDATA_PREFIX"),
+    }
+def _safe_import(module_name: str):
+    try:
+        module = __import__(module_name)
+        return module
+    except Exception as e:
+        logger.warning(
+            "Optional dependency not available: %s (%s)", module_name, str(e)
+        )
+        return None
+def _detect_file_kind(filename: str, content_type: Optional[str]) -> str:
+    name = (filename or "").lower()
+    if content_type:
+        ct = content_type.lower()
+        if "pdf" in ct:
+            return "pdf"
+        if "word" in ct or "docx" in ct or name.endswith(".docx"):
+            return "docx"
+        if "image" in ct or any(
+            name.endswith(ext)
+            for ext in [".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"]
+        ):
+            return "image"
+        if "text" in ct or name.endswith(".txt"):
+            return "text"
+    # Fallback by extension
+    if name.endswith(".pdf"):
+        return "pdf"
+    if name.endswith(".docx"):
+        return "docx"
+    if any(
+        name.endswith(ext)
+        for ext in [".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"]
+    ):
+        return "image"
+    if name.endswith(".txt"):
+        return "text"
+    return "unknown"
+def _read_bytes(file: UploadFile, max_size_mb: int = 40) -> bytes:
+    """
+    Read the uploaded file into memory with a soft limit to avoid crashing small instances.
+    """
+    limit = max_size_mb * 1024 * 1024
+    buf = io.BytesIO()
+    total = 0
+    while True:
+        chunk = file.file.read(1024 * 1024)
+        if not chunk:
+            break
+        total += len(chunk)
+        if total > limit:
+            raise HTTPException(
+                413, detail=f"File too large. Max allowed is {max_size_mb} MB."
+            )
+        buf.write(chunk)
+    return buf.getvalue()
+def _save_to_temp(data: bytes, suffix: str) -> str:
+    fd, path = tempfile.mkstemp(suffix=suffix)
+    os.close(fd)
+    with open(path, "wb") as f:
+        f.write(data)
+    return path
+# ---------------------------------------------------------------------------------------
+# Extraction
+# ---------------------------------------------------------------------------------------
+def extract_text_and_layout(
+    temp_path: str, file_kind: str
+) -> Tuple[str, Optional[str]]:
+    """
+    Best-effort extractor:
+    - PDF: try pdfminer for digital text; if empty, try OCR via pytesseract + pdf2image.
+    - DOCX: extract paragraphs via python-docx.
+    - IMAGE: OCR via pytesseract.
+    - TEXT: read as text.
+    Returns (plain_text, layout_info) where layout_info may be hOCR or None.
+    """
+    plain_text = ""
+    layout_info = None
+    if file_kind == "pdf":
+        pdfminer = _safe_import("pdfminer")
+        if pdfminer:
+            try:
+                from pdfminer.high_level import extract_text
+                plain_text = extract_text(temp_path) or ""
+            except Exception as e:
+                logger.warning("pdfminer failed: %s", str(e))
+        if not plain_text.strip():
+            # Try OCR
+            try:
+                pytesseract = _safe_import("pytesseract")
+                pdf2image = _safe_import("pdf2image")
+                if not pytesseract or not pdf2image:
+                    raise RuntimeError(
+                        "OCR dependencies (pytesseract/pdf2image) not available"
+                    )
+                from pdf2image import convert_from_path
+                images = convert_from_path(temp_path, dpi=300)
+                ocr_texts = []
+                hocr_blobs = []
+                for img in images:
+                    ocr_texts.append(pytesseract.image_to_string(img))
+                    hocr_blobs.append(
+                        pytesseract.image_to_pdf_or_hocr(img, extension="hocr").decode(
+                            "utf-8", errors="ignore"
+                        )
+                    )
+                plain_text = "\n".join(ocr_texts)
+                layout_info = "\n".join(hocr_blobs)
+            except Exception as e:
+                logger.warning("PDF OCR failed: %s", str(e))
+                if not plain_text:
+                    plain_text = ""
+    elif file_kind == "docx":
+        docx = _safe_import("docx")
+        if not docx:
+            raise HTTPException(
+                500, detail="python-docx not installed; cannot process DOCX"
+            )
+        try:
+            from docx import Document
+            doc = Document(temp_path)
+            plain_text = "\n".join([p.text for p in doc.paragraphs])
+        except Exception as e:
+            logger.error("DOCX extract failed: %s", str(e))
+            raise HTTPException(500, detail=f"Failed to read DOCX: {str(e)}")
+    elif file_kind == "image":
+        try:
+            pytesseract = _safe_import("pytesseract")
+            PIL = _safe_import("PIL")
+            if not pytesseract or not PIL:
+                raise RuntimeError(
+                    "OCR dependencies (pytesseract/Pillow) not available"
+                )
+            from PIL import Image
+            img = Image.open(temp_path)
+            plain_text = pytesseract.image_to_string(img)
+            layout_info = pytesseract.image_to_pdf_or_hocr(
+                img, extension="hocr"
+            ).decode("utf-8", errors="ignore")
+        except Exception as e:
+            logger.error("Image OCR failed: %s", str(e))
+            raise HTTPException(500, detail=f"OCR failed: {str(e)}")
+    elif file_kind == "text":
+        try:
+            with open(temp_path, "rb") as f:
+                raw = f.read()
+            # Try utf-8 first
+            try:
+                plain_text = raw.decode("utf-8")
+            except UnicodeDecodeError:
+                plain_text = raw.decode("latin-1", errors="ignore")
+        except Exception as e:
+            raise HTTPException(500, detail=f"Failed to read text: {str(e)}")
+    else:
+        raise HTTPException(
+            415, detail="Unsupported file type. Please upload PDF, DOCX, image, or TXT."
+        )
+    return plain_text, layout_info
+# ---------------------------------------------------------------------------------------
+# Gemini Integration
+# ---------------------------------------------------------------------------------------
+async def gemini_edit_text(prompt: str, text: str) -> str:
+    """
+    Call Gemini 2.5 Flash to transform text according to prompt.
+    Falls back to echo if GEMINI_API_KEY is not set.
+    """
+    api_key = os.getenv("GEMINI_API_KEY")
+    if not api_key:
+        # Fallback: return the original text with an annotation.
+        logger.warning("GEMINI_API_KEY not set; returning original text as fallback")
+        return f"{text}\n\n[Note: GEMINI_API_KEY not set. This is a fallback output without AI edits.]"
+    httpx = _safe_import("httpx")
+    if not httpx:
+        logger.warning("httpx not installed; returning original text as fallback")
+        return f"{text}\n\n[Note: httpx not installed. This is a fallback output without AI edits.]"
+    import httpx as _httpx  # type: ignore
+    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"
+    params = {"key": api_key}
+    # Keep prompt and doc text concise enough for free tier
+    user_text = f"Instruction:\n{prompt}\n\nDocument:\n{text}"
+    payload = {
+        "contents": [
+            {
+                "role": "user",
+                "parts": [{"text": user_text[:800000]}],
+            }  # basic truncation safety
+        ]
+    }
+    try:
+        async with _httpx.AsyncClient(timeout=120) as client:
+            resp = await client.post(url, params=params, json=payload)
+            if resp.status_code != 200:
+                detail = resp.text
+                logger.error("Gemini API error %s: %s", resp.status_code, detail)
+                raise HTTPException(502, detail=f"Gemini API error: {detail}")
+            data = resp.json()
+            # Parse response
+            candidates = data.get("candidates", [])
+            if not candidates:
+                logger.error(
+                    "Gemini returned no candidates: %s", json.dumps(data)[:500]
+                )
+                raise HTTPException(502, detail="Gemini returned no candidates")
+            parts = candidates[0].get("content", {}).get("parts", [])
+            if not parts:
+                logger.error("Gemini returned no parts: %s", json.dumps(data)[:500])
+                raise HTTPException(502, detail="Gemini returned empty response")
+            out_text = parts[0].get("text", "")
+            if not out_text.strip():
+                logger.warning("Gemini returned empty text; using original")
+                return text
+            return out_text
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.exception("Gemini call failed: %s", str(e))
+        raise HTTPException(502, detail=f"Gemini call failed: {str(e)}")
+# ---------------------------------------------------------------------------------------
+# Rebuilding document
+# ---------------------------------------------------------------------------------------
+def build_docx_from_text(modified_text: str) -> bytes:
+    """
+    Create a DOCX from the given text. This is a simple linear reconstruction.
+    More advanced layout preservation (headers/footers, bold/italics, alignment)
+    would require parsing source structure (e.g., DOCX XML, hOCR/ALTO) and mapping styles.
+    """
+    docx = _safe_import("docx")
+    if not docx:
+        raise HTTPException(
+            500, detail="python-docx not installed; cannot build DOCX output"
+        )
+    from docx import Document  # type: ignore
+    from docx.shared import Pt  # type: ignore
+    from docx.enum.text import WD_ALIGN_PARAGRAPH  # type: ignore
+    doc = Document()
+    # Set a base style
+    style = doc.styles["Normal"]
+    style.font.name = "Calibri"
+    style.font.size = Pt(11)
+    # Simple heuristic: split into paragraphs by blank lines
+    blocks = [b for b in modified_text.split("\n\n")]
+    for block in blocks:
+        p = doc.add_paragraph()
+        p.alignment = WD_ALIGN_PARAGRAPH.LEFT
+        for line in block.splitlines():
+            if p.text:
+                p.add_run().add_break()
+            p.add_run(line)
+    # Footer marker: if applicable
+    # We won't attempt automatic header/footer reconstruction here
+    out = io.BytesIO()
+    doc.save(out)
+    out.seek(0)
+    return out.read()
+# ---------------------------------------------------------------------------------------
+# Routes
+# ---------------------------------------------------------------------------------------
+@app.get("/", response_class=PlainTextResponse)
+def root():
+    return (
+        "Document Enhancer Backend (FastAPI)\n"
+        f"Running in Spaces: {_in_spaces()}\n"
+        "Open /docs for API spec.\n"
+    )
+@app.get("/healthz")
+def healthz():
+    return {"ok": True}
+@app.get("/env")
+def env():
+    # For debugging in Spaces
+    return _env_info()
+@app.post("/enhance")
+async def enhance_document(
+    file: UploadFile = File(
+        ..., description="Input document (PDF, DOCX, image, or TXT)"
+    ),
+    prompt: str = Form(
+        ...,
+        description='Instruction, e.g., "make summary at the end" or "change name from X to Y"',
+    ),
+    output_format: str = Form("docx", description="Output format: docx (default)"),
+):
+    """
+    Upload a document and a prompt. The backend will:
+    1) Extract text (and best-effort layout markers).
+    2) Send text + prompt to Gemini for editing.
+    3) Rebuild a document (DOCX by default) with the modified text.
+    Note:
+    - 100% layout preservation is a hard problem; this endpoint currently focuses on correctness of text edits first,
+      with a simple reconstruction. Extending to hOCR/ALTO -> DOCX/PDF reconstruction is possible with more code and deps.
+    """
+    # Read upload
+    raw = _read_bytes(file)
+    if not raw:
+        raise HTTPException(400, detail="Empty file")
+    # Determine file kind and save to temp
+    file_kind = _detect_file_kind(file.filename, file.content_type)
+    suffix = os.path.splitext(file.filename or "upload.bin")[1] or ".bin"
+    temp_path = _save_to_temp(raw, suffix=suffix)
+    logger.info("Saved upload to %s (%s)", temp_path, file_kind)
+    try:
+        # 1) Extract text and layout
+        plain_text, layout_info = extract_text_and_layout(temp_path, file_kind)
+        if not plain_text.strip():
+            raise HTTPException(422, detail="Could not extract text from the document")
+        # 2) Edit with Gemini
+        modified_text = await gemini_edit_text(prompt=prompt, text=plain_text)
+        # 3) Build output
+        out_fmt = (output_format or "docx").lower()
+        if out_fmt not in ("docx",):
+            raise HTTPException(
+                400, detail="Only docx output is supported in this entrypoint"
+            )
+        out_bytes = build_docx_from_text(modified_text)
+        out_name_base = (
+            os.path.splitext(os.path.basename(file.filename or "document"))[0]
+            or "document"
+        )
+        out_name = f"{out_name_base}-enhanced-{uuid.uuid4().hex[:8]}.docx"
+        return StreamingResponse(
+            io.BytesIO(out_bytes),
+            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            headers={"Content-Disposition": f'attachment; filename="{out_name}"'},
+        )
+    finally:
+        try:
+            os.remove(temp_path)
+        except Exception:
+            pass
+# ---------------------------------------------------------------------------------------
+# Local dev entrypoint (HF Spaces use a Procfile or just auto-run with uvicorn)
+# ---------------------------------------------------------------------------------------
+if __name__ == "__main__":
+    import uvicorn
+    port = int(os.getenv("PORT", "7860"))  # HF Spaces default
+    uvicorn.run(
+        "app:app", host="0.0.0.0", port=port, reload=not _in_spaces(), log_level="info"
+    )

requirements.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+# FastAPI app
+fastapi>=0.115.0
+uvicorn[standard]>=0.30.0
+python-multipart>=0.0.9
+# HTTP client for Gemini REST calls
+httpx>=0.27.0
+# DOCX building
+python-docx>=1.1.2
+# PDF/digital text extraction
+pdfminer.six>=20231228
+# OCR stack (note: requires the system packages tesseract-ocr and poppler-utils, which the Dockerfile installs via apt)
+pytesseract>=0.3.10
+pdf2image>=1.17.0
+Pillow>=10.3.0