Spaces:

fruk19
/

TYPHOON_OCR_DEMO

Runtime error

App Files Community

fruk19 committed on Dec 12, 2025

Commit

1dc843e

verified ·

1 Parent(s): af64003

update resize image

Browse files

Files changed (1) hide show

app.py +48 -68

app.py CHANGED Viewed

@@ -14,28 +14,27 @@ def resolve_file(file):
     """
     Normalize Gradio file object into a real filesystem file path.
     Handles:
-      - dict {name, data} (HF Spaces)
       - NamedString
       - tempfile object
     """
-    # Case 1: HF dict format
     if isinstance(file, dict) and "data" in file:
         raw = file["data"]
-        fname = file.get("name", f"file_{uuid.uuid4().hex}")
-        tmp_path = f"/tmp/{uuid.uuid4().hex}_{os.path.basename(fname)}"
-        with open(tmp_path, "wb") as f:
             f.write(raw if isinstance(raw, bytes) else raw.read())
-        return tmp_path
-    # Case 2: Gradio NamedString
     if hasattr(file, "name") and not hasattr(file, "path"):
         tmp_path = f"/tmp/{uuid.uuid4().hex}_{os.path.basename(file.name)}"
         with open(tmp_path, "wb") as f:
             f.write(open(file.name, "rb").read())
         return tmp_path
-    # Case 3: local tempfile
     if hasattr(file, "name"):
         return file.name
@@ -43,16 +42,37 @@ def resolve_file(file):
 # ================================================================
-# Helper: Resize (OCR version + Preview version)
 # ================================================================
-def resize_if_needed(img, max_size=1024):
     w, h = img.size
-    if max(w, h) <= max_size:
         return img
-    scale = max_size / max(w, h)
-    return img.resize((int(w * scale), int(h * scale)), Image.Resampling.LANCZOS)
 def resize_preview(img, max_size=400):
     w, h = img.size
     if max(w, h) <= max_size:
@@ -62,7 +82,7 @@ def resize_preview(img, max_size=400):
 # ================================================================
-# Typhoon OCR call
 # ================================================================
 def run_typhoon_ocr(img_bytes, api_key, model, task_type,
                     max_tokens, temperature, top_p, repetition_penalty):
@@ -123,6 +143,7 @@ def pdf_to_images_pymupdf(pdf_path, dpi=220):
 # ================================================================
 def preview_files(files):
     previews = []
     for file in files:
         real_path = resolve_file(file)
         fp = real_path.lower()
@@ -130,18 +151,20 @@ def preview_files(files):
         if fp.endswith(".pdf"):
             pdf_imgs = pdf_to_images_pymupdf(real_path, dpi=120)
             for img in pdf_imgs:
                 previews.append(resize_preview(img))
         else:
             img = Image.open(real_path)
             if img.mode == "RGBA":
                 img = img.convert("RGB")
             previews.append(resize_preview(img))
     return previews
 # ================================================================
-# OCR 1 page (parallel)
 # ================================================================
 def ocr_single_page(page_img, label,
                     api_key, model, task_type, max_tokens,
@@ -152,8 +175,7 @@ def ocr_single_page(page_img, label,
     buf.seek(0)
     txt = run_typhoon_ocr(
-        buf.getvalue(),
-        api_key, model, task_type,
         max_tokens, temperature, top_p, repetition_penalty
     )
     return label, txt
@@ -173,9 +195,7 @@ def extract_text(files,
     images_to_ocr = []
     labels = []
-    # ---------------------------
     # LOAD FILES
-    # ---------------------------
     for file in files:
         real_path = resolve_file(file)
         fp = real_path.lower()
@@ -183,23 +203,23 @@ def extract_text(files,
         if fp.endswith(".pdf"):
             pdf_imgs = pdf_to_images_pymupdf(real_path, dpi=220)
             for idx, img in enumerate(pdf_imgs, start=1):
-                images_to_ocr.append(resize_if_needed(img))
                 labels.append(f"{os.path.basename(real_path)} - Page {idx}")
         else:
             img = Image.open(real_path)
             if img.mode == "RGBA":
                 img = img.convert("RGB")
-            images_to_ocr.append(resize_if_needed(img))
             labels.append(os.path.basename(real_path))
     total = len(images_to_ocr)
     progress(0.03, desc=f"Loaded {total} pages/images")
-    # ---------------------------
     # PARALLEL OCR
-    # ---------------------------
-    start = time.time()
     results = {}
     with ThreadPoolExecutor(max_workers=4) as ex:
         futures = []
@@ -221,13 +241,12 @@ def extract_text(files,
             elapsed = time.time() - start
             eta = (total - done) * (elapsed / max(done, 1))
-            progress(done / total, desc=f"OCR {done}/{total} | ETA {eta:.1f}s")
     progress(1, desc="OCR Completed ✔")
-    # ---------------------------
     # MERGE RESULT
-    # ---------------------------
     merged = ""
     for lbl in sorted(results.keys()):
         merged += f"## {lbl}\n{results[lbl]}\n\n"
@@ -242,20 +261,6 @@ def extract_text(files,
 # ================================================================
 # UI
 # ================================================================
-# with gr.Blocks() as demo:
-#     gr.Markdown("""
-#     # 🔍 Typhoon OCR v1.5
-#     ### Multi-file OCR • Parallel Processing • ETA • PDF/Image Support
-#     ⚡ **High-speed OCR powered by Typhoon**
-#     📄 Upload **multiple images or PDFs**
-#     🚀 Parallel OCR with ETA per page
-#     🔍 Auto preview grid for all pages
-#     🔑 **Get your API Key:**
-#     👉 https://playground.opentyphoon.ai/settings/api-key
-#     """)
 with gr.Blocks() as demo:
     gr.Markdown("""
@@ -264,7 +269,7 @@ with gr.Blocks() as demo:
 ⚡ **High-speed OCR powered by Typhoon**
 📄 Upload **multiple images or PDFs**
-🚀 Parallel OCR with ETA per page
 🔍 Auto preview grid for all pages
 ---
@@ -278,29 +283,6 @@ Click it to generate or copy your key.
     gr.Markdown("### 📘 How to get API Key (step-by-step)")
-#     gr.HTML("""
-# <div style='display:flex; gap:24px; margin-top:10px;'>
-#   <div style='text-align:center;'>
-#     <img src='https://huggingface.co/spaces/fruk19/TYPHOON_OCR_DEMO/resolve/main/ocr_login.png'
-#          style='width:260px; border-radius:8px; border:1px solid #ccc;'>
-#     <p><b>1) Login</b></p>
-#   </div>
-#   <div style='text-align:center;'>
-#     <img src='https://huggingface.co/spaces/fruk19/TYPHOON_OCR_DEMO/resolve/main/ocr_first.png'
-#          style='width:260px; border-radius:8px; border:1px solid #ccc;'>
-#     <p><b>2) Find API Key Menu</b></p>
-#   </div>
-#   <div style='text-align:center;'>
-#     <img src='https://huggingface.co/spaces/fruk19/TYPHOON_OCR_DEMO/resolve/main/ocr_getkey.png'
-#          style='width:260px; border-radius:8px; border:1px solid #ccc;'>
-#     <p><b>3) Copy Your Key</b></p>
-#   </div>
-# </div>
-# """)
     with gr.Row():
         gr.Gallery(
             [
@@ -308,12 +290,10 @@ Click it to generate or copy your key.
                 ("ocr_first.png", "2) Find API Key Menu"),
                 ("ocr_getkey.png", "3) Copy Your Key"),
             ],
-            label="How to Get Your API Key (click to zoom)",
             columns=3,
             height=250,
             show_label=False,
         )
     file_input = gr.Files(label="Upload images or PDFs", file_count="multiple")

     """
     Normalize Gradio file object into a real filesystem file path.
     Handles:
+      - dict {name, data}  (HF Spaces)
       - NamedString
       - tempfile object
     """
+    # Case 1: HF dict
     if isinstance(file, dict) and "data" in file:
         raw = file["data"]
+        fname = file.get("name", f"{uuid.uuid4().hex}.bin")
+        path = f"/tmp/{uuid.uuid4().hex}_{os.path.basename(fname)}"
+        with open(path, "wb") as f:
             f.write(raw if isinstance(raw, bytes) else raw.read())
+        return path
+    # Case 2: NamedString (file.name only)
     if hasattr(file, "name") and not hasattr(file, "path"):
         tmp_path = f"/tmp/{uuid.uuid4().hex}_{os.path.basename(file.name)}"
         with open(tmp_path, "wb") as f:
             f.write(open(file.name, "rb").read())
         return tmp_path
+    # Case 3: normal tempfile with path
     if hasattr(file, "name"):
         return file.name
 # ================================================================
+# UNIVERSAL RESIZE: max bounds 800×1800, 1800×800, 1200×1200
 # ================================================================
+def resize_to_max_bounds(img,
+                         max_w1=800, max_h1=1800,
+                         max_w2=1800, max_h2=800,
+                         max_ws=1200, max_hs=1200):
+    """Resize image so it stays under max bounds while preserving aspect ratio."""
     w, h = img.size
+    bounds = [
+        (max_w1, max_h1),
+        (max_w2, max_h2),
+        (max_ws, max_hs),
+    ]
+    scale = 1.0
+    for max_w, max_h in bounds:
+        scale_w = max_w / w
+        scale_h = max_h / h
+        scale = min(scale, min(scale_w, scale_h))
+    if scale >= 1.0:
         return img
+    new_size = (int(w * scale), int(h * scale))
+    return img.resize(new_size, Image.Resampling.LANCZOS)
+# ================================================================
+# Preview resize
+# ================================================================
 def resize_preview(img, max_size=400):
     w, h = img.size
     if max(w, h) <= max_size:
 # ================================================================
+# Typhoon OCR API call
 # ================================================================
 def run_typhoon_ocr(img_bytes, api_key, model, task_type,
                     max_tokens, temperature, top_p, repetition_penalty):
 # ================================================================
 def preview_files(files):
     previews = []
     for file in files:
         real_path = resolve_file(file)
         fp = real_path.lower()
         if fp.endswith(".pdf"):
             pdf_imgs = pdf_to_images_pymupdf(real_path, dpi=120)
             for img in pdf_imgs:
+                img = resize_to_max_bounds(img)
                 previews.append(resize_preview(img))
         else:
             img = Image.open(real_path)
             if img.mode == "RGBA":
                 img = img.convert("RGB")
+            img = resize_to_max_bounds(img)
             previews.append(resize_preview(img))
     return previews
 # ================================================================
+# OCR 1 PAGE (PARALLEL)
 # ================================================================
 def ocr_single_page(page_img, label,
                     api_key, model, task_type, max_tokens,
     buf.seek(0)
     txt = run_typhoon_ocr(
+        buf.getvalue(), api_key, model, task_type,
         max_tokens, temperature, top_p, repetition_penalty
     )
     return label, txt
     images_to_ocr = []
     labels = []
     # LOAD FILES
     for file in files:
         real_path = resolve_file(file)
         fp = real_path.lower()
         if fp.endswith(".pdf"):
             pdf_imgs = pdf_to_images_pymupdf(real_path, dpi=220)
             for idx, img in enumerate(pdf_imgs, start=1):
+                img = resize_to_max_bounds(img)
+                images_to_ocr.append(img)
                 labels.append(f"{os.path.basename(real_path)} - Page {idx}")
         else:
             img = Image.open(real_path)
             if img.mode == "RGBA":
                 img = img.convert("RGB")
+            img = resize_to_max_bounds(img)
+            images_to_ocr.append(img)
             labels.append(os.path.basename(real_path))
     total = len(images_to_ocr)
     progress(0.03, desc=f"Loaded {total} pages/images")
     # PARALLEL OCR
     results = {}
+    start = time.time()
     with ThreadPoolExecutor(max_workers=4) as ex:
         futures = []
             elapsed = time.time() - start
             eta = (total - done) * (elapsed / max(done, 1))
+            progress(done / total,
+                     desc=f"OCR {done}/{total} | ETA {eta:.1f}s")
     progress(1, desc="OCR Completed ✔")
     # MERGE RESULT
     merged = ""
     for lbl in sorted(results.keys()):
         merged += f"## {lbl}\n{results[lbl]}\n\n"
 # ================================================================
 # UI
 # ================================================================
 with gr.Blocks() as demo:
     gr.Markdown("""
 ⚡ **High-speed OCR powered by Typhoon**
 📄 Upload **multiple images or PDFs**
+🚀 Parallel OCR with ETA
 🔍 Auto preview grid for all pages
 ---
     gr.Markdown("### 📘 How to get API Key (step-by-step)")
     with gr.Row():
         gr.Gallery(
             [
                 ("ocr_first.png", "2) Find API Key Menu"),
                 ("ocr_getkey.png", "3) Copy Your Key"),
             ],
             columns=3,
             height=250,
             show_label=False,
         )
     file_input = gr.Files(label="Upload images or PDFs", file_count="multiple")