omgy committed on
Commit
ec56450
·
verified ·
1 Parent(s): 07738ea

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +43 -0
  2. app.py +454 -0
  3. requirements.txt +18 -0
Dockerfile ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# syntax=docker/dockerfile:1.6

# Base Python image
FROM python:3.11-slim

# Environment for reliable, quiet, and unbuffered Python.
# PORT is only a default; the platform may override it at runtime.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PORT=7860

# Install system packages required for OCR and PDF rasterization
# - tesseract-ocr and language data (eng)
# - poppler-utils provides `pdftoppm` used by pdf2image
# - libgl1 needed by some Pillow operations in headless containers
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        tesseract-ocr \
        tesseract-ocr-eng \
        poppler-utils \
        libgl1 \
    && rm -rf /var/lib/apt/lists/*

# NOTE: TESSDATA_PREFIX is intentionally NOT set here. The tessdata directory
# name is versioned and differs between Debian releases (e.g. .../4.00/tessdata
# for Tesseract 4 vs .../5/tessdata for Tesseract 5), so a hard-coded path
# breaks OCR as soon as the base image moves. The packaged tesseract binary
# already knows its own data directory.

# App directory
WORKDIR /app

# Install Python dependencies first for better layer caching
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

# Copy application code
COPY . /app

# Expose default HF Spaces port
EXPOSE 7860

# Start the FastAPI server.
# A shell is used so that the PORT env var is honoured at runtime (falling back
# to 7860); `exec` makes uvicorn PID 1 so it receives stop signals directly.
CMD ["sh", "-c", "exec uvicorn app:app --host 0.0.0.0 --port ${PORT:-7860}"]
app.py ADDED
@@ -0,0 +1,454 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+ import uuid
4
+ import json
5
+ import shutil
6
+ import logging
7
+ import mimetypes
8
+ import tempfile
9
+ from typing import Optional, Tuple
10
+
11
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
12
+ from fastapi.middleware.cors import CORSMiddleware
13
+ from fastapi.responses import JSONResponse, StreamingResponse, PlainTextResponse
14
+
15
+ # ---------------------------------------------------------------------------------------
16
+ # App setup
17
+ # ---------------------------------------------------------------------------------------
18
+
19
app = FastAPI(
    title="Document Enhancer Backend",
    description=(
        "A FastAPI backend suitable for Hugging Face Spaces. "
        "It accepts a document and a prompt, extracts text/layout, calls Gemini for edits, "
        "and rebuilds a document with the requested tweaks. "
        "Note: Full layout preservation with OCR is complex; this entrypoint provides a working scaffold."
    ),
    version="0.1.0",
)

# Allow CORS for web UIs hosted elsewhere.
# Credentials are disabled because a wildcard origin must not be combined with
# credentialed requests; none of the routes in this file read cookies or auth
# headers. To enable credentials, list the allowed origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Restrict in production
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Reuse uvicorn's error logger so messages share the server's handlers/format.
logger = logging.getLogger("uvicorn.error")
40
+
41
+
42
+ # ---------------------------------------------------------------------------------------
43
+ # Utilities
44
+ # ---------------------------------------------------------------------------------------
45
+
46
+
47
def _in_spaces() -> bool:
    """Report whether the process appears to run inside a Hugging Face Space.

    Heuristic only: returns True when any of the ``SPACE_ID``, ``HF_SPACE`` or
    ``SYSTEM`` environment variables is set to a non-empty value.
    """
    marker_vars = ("SPACE_ID", "HF_SPACE", "SYSTEM")
    return any(os.getenv(var_name) for var_name in marker_vars)
50
+
51
+
52
def _env_info() -> dict:
    """Collect a small, non-secret summary of the runtime environment.

    Returns:
        A dict with:
        - ``running_in_spaces``: result of ``_in_spaces()``.
        - ``python``: interpreter version as ``"major.minor.micro"``.
        - ``gemini_api_key_set``: whether ``GEMINI_API_KEY`` is non-empty
          (the key itself is never returned).
        - ``tesseract_cmd`` / ``tessdata_prefix``: raw values of the
          ``TESSERACT_CMD`` / ``TESSDATA_PREFIX`` env vars, or ``None``.
    """
    # Import sys explicitly: `os.sys` only works because the os module happens
    # to import sys internally, which is not a documented attribute.
    import sys

    version = sys.version_info
    return {
        "running_in_spaces": _in_spaces(),
        "python": f"{version.major}.{version.minor}.{version.micro}",
        "gemini_api_key_set": bool(os.getenv("GEMINI_API_KEY")),
        "tesseract_cmd": os.getenv("TESSERACT_CMD"),
        "tessdata_prefix": os.getenv("TESSDATA_PREFIX"),
    }
60
+
61
+
62
def _safe_import(module_name: str):
    """Import an optional dependency, returning ``None`` when it is unavailable.

    Args:
        module_name: Absolute module name; dotted names (``"pkg.sub"``) are
            supported and yield the named submodule itself.

    Returns:
        The imported module object, or ``None`` if importing raised anything
        (the failure is logged as a warning instead of propagating).
    """
    # importlib.import_module returns the module that was asked for, whereas
    # __import__("a.b") returns the top-level package "a".
    import importlib

    try:
        return importlib.import_module(module_name)
    except Exception as e:
        # Broad on purpose: optional packages can fail at import time for many
        # reasons (missing shared libraries, bad installs), not just ImportError.
        logger.warning(
            "Optional dependency not available: %s (%s)", module_name, str(e)
        )
        return None
71
+
72
+
73
def _detect_file_kind(filename: str, content_type: Optional[str]) -> str:
    """Classify an upload as ``"pdf"``, ``"docx"``, ``"image"``, ``"text"`` or ``"unknown"``.

    When a content type is supplied it is checked first (substring match,
    case-insensitive), with the file extension acting as an extra hint for the
    docx/image/text checks. If that yields nothing, or no content type was
    given, the decision falls back to the file extension alone.
    """
    image_suffixes = (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp")

    lowered_name = (filename or "").lower()
    named_docx = lowered_name.endswith(".docx")
    named_image = lowered_name.endswith(image_suffixes)
    named_text = lowered_name.endswith(".txt")

    mime = content_type.lower() if content_type else ""
    if mime:
        if "pdf" in mime:
            return "pdf"
        if named_docx or "word" in mime or "docx" in mime:
            return "docx"
        if named_image or "image" in mime:
            return "image"
        if named_text or "text" in mime:
            return "text"

    # Fallback by extension, in the same priority order as above.
    by_extension = (
        ("pdf", lowered_name.endswith(".pdf")),
        ("docx", named_docx),
        ("image", named_image),
        ("text", named_text),
    )
    for kind, matches in by_extension:
        if matches:
            return kind
    return "unknown"
101
+
102
+
103
def _read_bytes(file: UploadFile, max_size_mb: int = 40) -> bytes:
    """
    Load an upload fully into memory, refusing anything above ``max_size_mb``.

    The underlying file object is consumed in 1 MiB chunks so that an oversized
    upload is rejected (HTTP 413) as soon as the limit is crossed rather than
    after it has been read completely.
    """
    one_mib = 1024 * 1024
    byte_limit = max_size_mb * 1024 * 1024
    pieces = []
    received = 0
    while piece := file.file.read(one_mib):
        received += len(piece)
        if received > byte_limit:
            raise HTTPException(
                413, detail=f"File too large. Max allowed is {max_size_mb} MB."
            )
        pieces.append(piece)
    return b"".join(pieces)
121
+
122
+
123
def _save_to_temp(data: bytes, suffix: str) -> str:
    """Write ``data`` to a new temporary file and return its path.

    Args:
        data: Raw bytes to persist.
        suffix: File-name suffix (e.g. ``".pdf"``) for the temp file.

    Returns:
        Path of the created file. The caller is responsible for deleting it.

    Raises:
        OSError: If the file cannot be created or written; in that case the
            partially written temp file is removed before re-raising.
    """
    fd, path = tempfile.mkstemp(suffix=suffix)
    try:
        # Write through the descriptor mkstemp already opened instead of
        # closing it and reopening the path a second time.
        with os.fdopen(fd, "wb") as f:
            f.write(data)
    except BaseException:
        # Do not leave a stray temp file behind when the write fails.
        try:
            os.remove(path)
        except OSError:
            pass
        raise
    return path
129
+
130
+
131
+ # ---------------------------------------------------------------------------------------
132
+ # Extraction
133
+ # ---------------------------------------------------------------------------------------
134
+
135
+
136
def extract_text_and_layout(
    temp_path: str, file_kind: str
) -> Tuple[str, Optional[str]]:
    """
    Best-effort extractor:
    - PDF: try pdfminer for digital text; if empty, try OCR via pytesseract + pdf2image.
    - DOCX: extract paragraphs via python-docx.
    - IMAGE: OCR via pytesseract.
    - TEXT: read as text.
    Returns (plain_text, layout_info) where layout_info may be hOCR or None.

    Args:
        temp_path: Path of the file on disk.
        file_kind: One of "pdf", "docx", "image", "text".

    Raises:
        HTTPException: 415 for any other ``file_kind``; 500 when DOCX, image or
            text extraction fails. PDF failures are only logged, so a PDF that
            cannot be read yields empty text instead of an error.
    """
    plain_text = ""
    layout_info = None

    if file_kind == "pdf":
        if _safe_import("pdfminer"):
            try:
                from pdfminer.high_level import extract_text

                plain_text = extract_text(temp_path) or ""
            except Exception as e:
                logger.warning("pdfminer failed: %s", str(e))
        if not plain_text.strip():
            # No embedded text (e.g. a scanned PDF): rasterize and OCR each page.
            try:
                pytesseract = _safe_import("pytesseract")
                pdf2image = _safe_import("pdf2image")
                if not pytesseract or not pdf2image:
                    raise RuntimeError(
                        "OCR dependencies (pytesseract/pdf2image) not available"
                    )
                from pdf2image import convert_from_path

                images = convert_from_path(temp_path, dpi=300)
                ocr_texts = []
                hocr_blobs = []
                for img in images:
                    ocr_texts.append(pytesseract.image_to_string(img))
                    hocr_blobs.append(
                        pytesseract.image_to_pdf_or_hocr(img, extension="hocr").decode(
                            "utf-8", errors="ignore"
                        )
                    )
                plain_text = "\n".join(ocr_texts)
                layout_info = "\n".join(hocr_blobs)
            except Exception as e:
                logger.warning("PDF OCR failed: %s", str(e))
    elif file_kind == "docx":
        if not _safe_import("docx"):
            raise HTTPException(
                500, detail="python-docx not installed; cannot process DOCX"
            )
        try:
            from docx import Document

            doc = Document(temp_path)
            plain_text = "\n".join(p.text for p in doc.paragraphs)
        except Exception as e:
            logger.error("DOCX extract failed: %s", str(e))
            raise HTTPException(500, detail=f"Failed to read DOCX: {str(e)}") from e
    elif file_kind == "image":
        try:
            pytesseract = _safe_import("pytesseract")
            PIL = _safe_import("PIL")
            if not pytesseract or not PIL:
                raise RuntimeError(
                    "OCR dependencies (pytesseract/Pillow) not available"
                )
            from PIL import Image

            # Context manager closes the underlying file handle once OCR is
            # done, so the temp file is not left open when it is deleted later.
            with Image.open(temp_path) as img:
                plain_text = pytesseract.image_to_string(img)
                layout_info = pytesseract.image_to_pdf_or_hocr(
                    img, extension="hocr"
                ).decode("utf-8", errors="ignore")
        except Exception as e:
            logger.error("Image OCR failed: %s", str(e))
            raise HTTPException(500, detail=f"OCR failed: {str(e)}") from e
    elif file_kind == "text":
        try:
            with open(temp_path, "rb") as f:
                raw = f.read()
            # Try utf-8 first; latin-1 maps every byte, so the fallback cannot fail.
            try:
                plain_text = raw.decode("utf-8")
            except UnicodeDecodeError:
                plain_text = raw.decode("latin-1", errors="ignore")
        except Exception as e:
            raise HTTPException(500, detail=f"Failed to read text: {str(e)}") from e
    else:
        raise HTTPException(
            415, detail="Unsupported file type. Please upload PDF, DOCX, image, or TXT."
        )

    return plain_text, layout_info
235
+
236
+
237
+ # ---------------------------------------------------------------------------------------
238
+ # Gemini Integration
239
+ # ---------------------------------------------------------------------------------------
240
+
241
+
242
async def gemini_edit_text(prompt: str, text: str) -> str:
    """
    Call Gemini 2.5 Flash to transform text according to prompt.
    Falls back to echo if GEMINI_API_KEY is not set.

    Args:
        prompt: The user's editing instruction.
        text: The extracted document text to edit.

    Returns:
        The edited text. When GEMINI_API_KEY is unset or httpx is missing, the
        original text with an explanatory note appended. When Gemini answers
        with blank text, the original text unchanged.

    Raises:
        HTTPException: 502 when the API responds with a non-200 status, returns
            no candidates/parts, or the request itself fails.
    """
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        # Fallback: return the original text with an annotation.
        logger.warning("GEMINI_API_KEY not set; returning original text as fallback")
        return f"{text}\n\n[Note: GEMINI_API_KEY not set. This is a fallback output without AI edits.]"

    httpx = _safe_import("httpx")
    if not httpx:
        logger.warning("httpx not installed; returning original text as fallback")
        return f"{text}\n\n[Note: httpx not installed. This is a fallback output without AI edits.]"

    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"
    # Send the key as a header rather than a `?key=` query parameter so it
    # cannot end up in request logs or error messages that include the URL.
    headers = {"x-goog-api-key": api_key}
    user_text = f"Instruction:\n{prompt}\n\nDocument:\n{text}"

    payload = {
        "contents": [
            {
                "role": "user",
                # Basic truncation safety: cap the request at 800000 characters.
                "parts": [{"text": user_text[:800000]}],
            }
        ]
    }

    try:
        async with httpx.AsyncClient(timeout=120) as client:
            resp = await client.post(url, headers=headers, json=payload)
            if resp.status_code != 200:
                detail = resp.text
                logger.error("Gemini API error %s: %s", resp.status_code, detail)
                raise HTTPException(502, detail=f"Gemini API error: {detail}")

            data = resp.json()
            candidates = data.get("candidates", [])
            if not candidates:
                logger.error(
                    "Gemini returned no candidates: %s", json.dumps(data)[:500]
                )
                raise HTTPException(502, detail="Gemini returned no candidates")
            parts = (candidates[0].get("content") or {}).get("parts") or []
            if not parts:
                logger.error("Gemini returned no parts: %s", json.dumps(data)[:500])
                raise HTTPException(502, detail="Gemini returned empty response")
            # A candidate may carry its text in several parts; concatenate them
            # all instead of keeping only the first one.
            out_text = "".join(part.get("text") or "" for part in parts)
            if not out_text.strip():
                logger.warning("Gemini returned empty text; using original")
                return text
            return out_text
    except HTTPException:
        # Already a well-formed API error; pass it through untouched.
        raise
    except Exception as e:
        logger.exception("Gemini call failed: %s", str(e))
        raise HTTPException(502, detail=f"Gemini call failed: {str(e)}") from e
304
+
305
+
306
+ # ---------------------------------------------------------------------------------------
307
+ # Rebuilding document
308
+ # ---------------------------------------------------------------------------------------
309
+
310
+
311
def build_docx_from_text(modified_text: str) -> bytes:
    """
    Create a DOCX from the given text. This is a simple linear reconstruction.
    More advanced layout preservation (headers/footers, bold/italics, alignment)
    would require parsing source structure (e.g., DOCX XML, hOCR/ALTO) and mapping styles.

    Blank lines separate paragraphs; single newlines inside a paragraph become
    line breaks. All text uses the "Normal" style set to Calibri 11 pt,
    left-aligned.

    Args:
        modified_text: The text to place in the document.

    Returns:
        The serialized .docx file as bytes.

    Raises:
        HTTPException: 500 when python-docx is not installed.
    """
    docx = _safe_import("docx")
    if not docx:
        raise HTTPException(
            500, detail="python-docx not installed; cannot build DOCX output"
        )

    from docx import Document  # type: ignore
    from docx.shared import Pt  # type: ignore
    from docx.enum.text import WD_ALIGN_PARAGRAPH  # type: ignore

    doc = Document()
    # Set a base style
    style = doc.styles["Normal"]
    style.font.name = "Calibri"
    style.font.size = Pt(11)

    # Normalize CRLF / CR newlines first so that blank-line paragraph detection
    # also works for text that originated from Windows-style files.
    normalized = modified_text.replace("\r\n", "\n").replace("\r", "\n")

    # Simple heuristic: split into paragraphs by blank lines
    for block in normalized.split("\n\n"):
        p = doc.add_paragraph()
        p.alignment = WD_ALIGN_PARAGRAPH.LEFT
        for index, line in enumerate(block.splitlines()):
            # Break before every line except the first. Using the index (rather
            # than checking whether the paragraph has text yet) keeps the break
            # after a leading empty line and avoids re-joining all runs per line.
            if index:
                p.add_run().add_break()
            p.add_run(line)

    # We won't attempt automatic header/footer reconstruction here
    out = io.BytesIO()
    doc.save(out)
    return out.getvalue()
350
+
351
+
352
+ # ---------------------------------------------------------------------------------------
353
+ # Routes
354
+ # ---------------------------------------------------------------------------------------
355
+
356
+
357
@app.get("/", response_class=PlainTextResponse)
def root():
    """Serve a short plain-text banner with the Spaces flag and a pointer to /docs."""
    spaces_flag = _in_spaces()
    banner_parts = (
        "Document Enhancer Backend (FastAPI)\n",
        f"Running in Spaces: {spaces_flag}\n",
        "Open /docs for API spec.\n",
    )
    return "".join(banner_parts)
364
+
365
+
366
@app.get("/healthz")
def healthz():
    """Liveness probe: always answers with ``{"ok": True}``."""
    status = {"ok": True}
    return status
369
+
370
+
371
@app.get("/env")
def env():
    """Return the diagnostic summary built by ``_env_info`` (for debugging in Spaces)."""
    diagnostics = _env_info()
    return diagnostics
375
+
376
+
377
@app.post("/enhance")
async def enhance_document(
    file: UploadFile = File(
        ..., description="Input document (PDF, DOCX, image, or TXT)"
    ),
    prompt: str = Form(
        ...,
        description='Instruction, e.g., "make summary at the end" or "change name from X to Y"',
    ),
    output_format: str = Form("docx", description="Output format: docx (default)"),
):
    """
    Upload a document and a prompt. The backend will:
    1) Extract text (and best-effort layout markers).
    2) Send text + prompt to Gemini for editing.
    3) Rebuild a document (DOCX by default) with the modified text.

    Note:
    - 100% layout preservation is a hard problem; this endpoint currently focuses on correctness of text edits first,
    with a simple reconstruction. Extending to hOCR/ALTO -> DOCX/PDF reconstruction is possible with more code and deps.

    Returns:
        The rebuilt DOCX as an attachment download.

    Raises:
        HTTPException: 400 for an unsupported output format or an empty upload,
            422 when no text could be extracted, plus any error raised by the
            read/extract/edit/build helpers.
    """
    import re
    from urllib.parse import quote

    # Reject unsupported output formats up front, before spending time on
    # extraction/OCR and a Gemini call whose result would be thrown away.
    out_fmt = (output_format or "docx").lower()
    if out_fmt not in ("docx",):
        raise HTTPException(
            400, detail="Only docx output is supported in this entrypoint"
        )

    # Read upload
    raw = _read_bytes(file)
    if not raw:
        raise HTTPException(400, detail="Empty file")

    # Determine file kind and save to temp
    file_kind = _detect_file_kind(file.filename, file.content_type)
    suffix = os.path.splitext(file.filename or "upload.bin")[1] or ".bin"
    temp_path = _save_to_temp(raw, suffix=suffix)
    logger.info("Saved upload to %s (%s)", temp_path, file_kind)

    try:
        # 1) Extract text and layout (layout info is not used by the simple rebuild)
        plain_text, _layout_info = extract_text_and_layout(temp_path, file_kind)
        if not plain_text.strip():
            raise HTTPException(422, detail="Could not extract text from the document")

        # 2) Edit with Gemini
        modified_text = await gemini_edit_text(prompt=prompt, text=plain_text)

        # 3) Build output
        out_bytes = build_docx_from_text(modified_text)
        out_name_base = (
            os.path.splitext(os.path.basename(file.filename or "document"))[0]
            or "document"
        )
        out_name = f"{out_name_base}-enhanced-{uuid.uuid4().hex[:8]}.docx"

        # The name derives from the client-supplied filename, so it may contain
        # quotes, control characters or non-latin-1 text that would corrupt the
        # header or make it impossible to encode. Send a sanitized ASCII name
        # plus the exact name percent-encoded via `filename*` (RFC 6266).
        ascii_name = re.sub(r"[^A-Za-z0-9._-]+", "_", out_name)
        disposition = (
            f'attachment; filename="{ascii_name}"; '
            f"filename*=UTF-8''{quote(out_name, safe='')}"
        )

        return StreamingResponse(
            io.BytesIO(out_bytes),
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            headers={"Content-Disposition": disposition},
        )
    finally:
        # Best-effort cleanup: a failure here must not mask the real outcome,
        # but it is logged so leaked temp files can be noticed.
        try:
            os.remove(temp_path)
        except OSError as e:
            logger.warning("Could not remove temp file %s: %s", temp_path, e)
442
+
443
+
444
+ # ---------------------------------------------------------------------------------------
445
+ # Local dev entrypoint (HF Spaces use a Procfile or just auto-run with uvicorn)
446
+ # ---------------------------------------------------------------------------------------
447
+
448
if __name__ == "__main__":
    import uvicorn

    # PORT overrides the listening port; 7860 is the HF Spaces default.
    listen_port = int(os.getenv("PORT", "7860"))
    # Auto-reload is switched on only when not running inside a Space.
    auto_reload = not _in_spaces()
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=listen_port,
        reload=auto_reload,
        log_level="info",
    )
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FastAPI app
2
+ fastapi>=0.115.0
3
+ uvicorn[standard]>=0.30.0
4
+ python-multipart>=0.0.9
5
+
6
+ # HTTP client for Gemini REST calls
7
+ httpx>=0.27.0
8
+
9
+ # DOCX building
10
+ python-docx>=1.1.2
11
+
12
+ # PDF/digital text extraction
13
+ pdfminer.six>=20231228
14
+
15
+ # OCR stack (note: requires the system packages tesseract-ocr and poppler-utils, which this repo installs via the Dockerfile)
16
+ pytesseract>=0.3.10
17
+ pdf2image>=1.17.0
18
+ Pillow>=10.3.0